Ashgen12 committed
Commit b399279 · verified · 1 Parent(s): 3fee850
Files changed (1)
  1. scraper.py +159 -149
scraper.py CHANGED
@@ -1,150 +1,160 @@
- # ai_test_generator/scraper.py
- import time
- import json
- from selenium import webdriver
- from selenium.webdriver.chrome.service import Service
- from selenium.webdriver.chrome.options import Options
- from webdriver_manager.chrome import ChromeDriverManager
- from bs4 import BeautifulSoup
- import logging
-
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
- def extract_elements(url: str) -> list[dict]:
-     """
-     Scrapes a website URL to extract buttons, links, input fields, and forms.
-
-     Args:
-         url: The public URL of the website to scrape.
-
-     Returns:
-         A list of dictionaries, each representing an extracted UI element.
-         Returns an empty list if scraping fails.
-     """
-     logging.info(f"Starting scraping for URL: {url}")
-     extracted_elements = []
-
-     chrome_options = Options()
-     chrome_options.add_argument("--headless") # Run headless (no GUI)
-     chrome_options.add_argument("--no-sandbox")
-     chrome_options.add_argument("--disable-dev-shm-usage")
-     chrome_options.add_argument("--disable-gpu") # Recommended for headless
-     chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") # Set user agent
-
-     service = Service(ChromeDriverManager().install())
-     driver = None
-
-     try:
-         driver = webdriver.Chrome(service=service, options=chrome_options)
-         driver.set_page_load_timeout(30) # Set timeout for page load
-         driver.get(url)
-         # Allow some time for dynamic content to potentially load
-         # A more robust solution might use WebDriverWait
-         time.sleep(3)
-
-         page_source = driver.page_source
-         soup = BeautifulSoup(page_source, 'lxml') # Use lxml parser
-
-         # --- Extract Buttons ---
-         buttons = soup.find_all('button')
-         for btn in buttons:
-             element_data = {
-                 'type': 'button',
-                 'text': btn.get_text(strip=True),
-                 'id': btn.get('id'),
-                 'name': btn.get('name'),
-                 'class': btn.get('class'),
-                 'attributes': {k: v for k, v in btn.attrs.items() if k not in ['id', 'name', 'class']}
-             }
-             extracted_elements.append(element_data)
-         logging.info(f"Found {len(buttons)} buttons.")
-
-         # --- Extract Links ---
-         links = soup.find_all('a')
-         for link in links:
-             element_data = {
-                 'type': 'link',
-                 'text': link.get_text(strip=True),
-                 'href': link.get('href'),
-                 'id': link.get('id'),
-                 'class': link.get('class'),
-                 'attributes': {k: v for k, v in link.attrs.items() if k not in ['id', 'class', 'href']}
-             }
-             extracted_elements.append(element_data)
-         logging.info(f"Found {len(links)} links.")
-
-         # --- Extract Input Fields ---
-         inputs = soup.find_all('input')
-         for inp in inputs:
-             element_data = {
-                 'type': 'input',
-                 'input_type': inp.get('type', 'text'), # Default to 'text' if type not specified
-                 'id': inp.get('id'),
-                 'name': inp.get('name'),
-                 'placeholder': inp.get('placeholder'),
-                 'value': inp.get('value'),
-                 'class': inp.get('class'),
-                 'attributes': {k: v for k, v in inp.attrs.items() if k not in ['id', 'name', 'class', 'type', 'placeholder', 'value']}
-             }
-             extracted_elements.append(element_data)
-         logging.info(f"Found {len(inputs)} input fields.")
-
-         # --- Extract Forms ---
-         forms = soup.find_all('form')
-         for form in forms:
-             form_elements = []
-             # Find elements within this specific form
-             for child_input in form.find_all('input'):
-                 form_elements.append({
-                     'tag': 'input',
-                     'type': child_input.get('type'),
-                     'id': child_input.get('id'),
-                     'name': child_input.get('name')
-                 })
-             for child_button in form.find_all('button'):
-                 form_elements.append({
-                     'tag': 'button',
-                     'type': child_button.get('type'),
-                     'id': child_button.get('id'),
-                     'name': child_button.get('name'),
-                     'text': child_button.get_text(strip=True)
-                 })
-             # Add other form element types if needed (select, textarea)
-
-             element_data = {
-                 'type': 'form',
-                 'id': form.get('id'),
-                 'action': form.get('action'),
-                 'method': form.get('method'),
-                 'class': form.get('class'),
-                 'contained_elements': form_elements,
-                 'attributes': {k: v for k, v in form.attrs.items() if k not in ['id', 'class', 'action', 'method']}
-             }
-             extracted_elements.append(element_data)
-         logging.info(f"Found {len(forms)} forms.")
-
-         logging.info(f"Successfully extracted {len(extracted_elements)} elements in total.")
-
-     except Exception as e:
-         logging.error(f"Error during scraping URL {url}: {e}", exc_info=True)
-         # Return empty list on error, Gradio app will handle this
-         return []
-     finally:
-         if driver:
-             driver.quit()
-             logging.info("WebDriver closed.")
-
-     return extracted_elements
-
- # Example usage (optional, for testing scraper independently)
- # if __name__ == '__main__':
- #     test_url = "https://demoblaze.com/"
- #     elements = extract_elements(test_url)
- #     if elements:
- #         print(f"Extracted {len(elements)} elements.")
- #         # Save to a temporary file for inspection
- #         with open("temp_elements.json", "w", encoding="utf-8") as f:
- #             json.dump(elements, f, indent=4)
- #         print("Saved results to temp_elements.json")
- #     else:
+ # ai_test_generator/scraper.py
+ import time
+ import json
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from selenium.webdriver.chrome.options import Options
+ # from webdriver_manager.chrome import ChromeDriverManager
+ from bs4 import BeautifulSoup
+ import logging
+
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ def extract_elements(url: str) -> list[dict]:
+     """
+     Scrapes a website URL to extract buttons, links, input fields, and forms.
+
+     Args:
+         url: The public URL of the website to scrape.
+
+     Returns:
+         A list of dictionaries, each representing an extracted UI element.
+         Returns an empty list if scraping fails.
+     """
+     logging.info(f"Starting scraping for URL: {url}")
+     extracted_elements = []
+
+     chrome_options = Options()
+     chrome_options.add_argument("--headless") # Run headless (no GUI)
+     chrome_options.add_argument("--no-sandbox")
+     chrome_options.add_argument("--disable-dev-shm-usage")
+     chrome_options.add_argument("--disable-gpu") # Recommended for headless
+     chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") # Set user agent
+
+     # service = Service(ChromeDriverManager().install())
+     driver = None
+
+     try:
+         # --- Use the system's ChromeDriver ---
+         # Specify the path to the driver installed via packages.txt
+         chromedriver_path = "/usr/bin/chromedriver"
+         logging.info(f"Attempting to use system chromedriver at: {chromedriver_path}")
+         service = Service(executable_path=chromedriver_path)
+         # --- End of change ---
+
+         driver = webdriver.Chrome(service=service, options=chrome_options)
+         driver.set_page_load_timeout(30) # Set timeout for page load
+         driver.get(url)
+         # Allow some time for dynamic content to potentially load
+         # A more robust solution might use WebDriverWait
+         time.sleep(3)
+
+         page_source = driver.page_source
+         soup = BeautifulSoup(page_source, 'lxml') # Use lxml parser
+
+         # --- Extract Buttons ---
+         buttons = soup.find_all('button')
+         for btn in buttons:
+             element_data = {
+                 'type': 'button',
+                 'text': btn.get_text(strip=True),
+                 'id': btn.get('id'),
+                 'name': btn.get('name'),
+                 'class': btn.get('class'),
+                 'attributes': {k: v for k, v in btn.attrs.items() if k not in ['id', 'name', 'class']}
+             }
+             extracted_elements.append(element_data)
+         logging.info(f"Found {len(buttons)} buttons.")
+
+         # --- Extract Links ---
+         links = soup.find_all('a')
+         for link in links:
+             element_data = {
+                 'type': 'link',
+                 'text': link.get_text(strip=True),
+                 'href': link.get('href'),
+                 'id': link.get('id'),
+                 'class': link.get('class'),
+                 'attributes': {k: v for k, v in link.attrs.items() if k not in ['id', 'class', 'href']}
+             }
+             extracted_elements.append(element_data)
+         logging.info(f"Found {len(links)} links.")
+
+         # --- Extract Input Fields ---
+         inputs = soup.find_all('input')
+         for inp in inputs:
+             element_data = {
+                 'type': 'input',
+                 'input_type': inp.get('type', 'text'), # Default to 'text' if type not specified
+                 'id': inp.get('id'),
+                 'name': inp.get('name'),
+                 'placeholder': inp.get('placeholder'),
+                 'value': inp.get('value'),
+                 'class': inp.get('class'),
+                 'attributes': {k: v for k, v in inp.attrs.items() if k not in ['id', 'name', 'class', 'type', 'placeholder', 'value']}
+             }
+             extracted_elements.append(element_data)
+         logging.info(f"Found {len(inputs)} input fields.")
+
+         # --- Extract Forms ---
+         forms = soup.find_all('form')
+         for form in forms:
+             form_elements = []
+             # Find elements within this specific form
+             for child_input in form.find_all('input'):
+                 form_elements.append({
+                     'tag': 'input',
+                     'type': child_input.get('type'),
+                     'id': child_input.get('id'),
+                     'name': child_input.get('name')
+                 })
+             for child_button in form.find_all('button'):
+                 form_elements.append({
+                     'tag': 'button',
+                     'type': child_button.get('type'),
+                     'id': child_button.get('id'),
+                     'name': child_button.get('name'),
+                     'text': child_button.get_text(strip=True)
+                 })
+             # Add other form element types if needed (select, textarea)
+
+             element_data = {
+                 'type': 'form',
+                 'id': form.get('id'),
+                 'action': form.get('action'),
+                 'method': form.get('method'),
+                 'class': form.get('class'),
+                 'contained_elements': form_elements,
+                 'attributes': {k: v for k, v in form.attrs.items() if k not in ['id', 'class', 'action', 'method']}
+             }
+             extracted_elements.append(element_data)
+         logging.info(f"Found {len(forms)} forms.")
+
+         logging.info(f"Successfully extracted {len(extracted_elements)} elements in total.")
+
+     except FileNotFoundError:
+         logging.error(f"ERROR: System Chromedriver not found at {chromedriver_path}. Make sure 'chromium-driver' is in packages.txt.")
+
+     except Exception as e:
+         logging.error(f"Error during scraping URL {url}: {e}", exc_info=True)
+         # Return empty list on error, Gradio app will handle this
+         return []
+     finally:
+         if driver:
+             driver.quit()
+             logging.info("WebDriver closed.")
+
+     return extracted_elements
+
+ # Example usage (optional, for testing scraper independently)
+ # if __name__ == '__main__':
+ #     test_url = "https://demoblaze.com/"
+ #     elements = extract_elements(test_url)
+ #     if elements:
+ #         print(f"Extracted {len(elements)} elements.")
+ #         # Save to a temporary file for inspection
+ #         with open("temp_elements.json", "w", encoding="utf-8") as f:
+ #             json.dump(elements, f, indent=4)
+ #         print("Saved results to temp_elements.json")
+ #     else:
  #         print("Scraping failed.")
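Note on the code above: the in-file comment points out that an explicit WebDriverWait would be more robust than the fixed time.sleep(3). A minimal sketch of that approach follows; the helper name wait_for_page, the 10-second timeout, and the wait-for-<body> condition are illustrative assumptions and are not part of this commit.

# Sketch only (not part of the committed file): wait for the DOM instead of sleeping.
# The timeout value and the wait condition below are arbitrary illustrative choices.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_page(driver, timeout: int = 10) -> None:
    """Block until a <body> element is present, rather than sleeping a fixed 3 seconds."""
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )

As a follow-up note on the main change: the hard-coded chromedriver_path assumes a system driver at /usr/bin/chromedriver, which is where the chromium-driver package named in the commit's error message is typically installed on Debian-based images; on other systems the path may differ.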