awacke1 commited on
Commit
3386078
·
verified ·
1 Parent(s): 11dd3f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +150 -121
app.py CHANGED
@@ -1,21 +1,30 @@
1
  import streamlit as st
2
  import requests
3
  from bs4 import BeautifulSoup
4
- from selenium import webdriver
5
- from selenium.webdriver.chrome.options import Options
6
- from selenium.webdriver.common.by import By
7
  from PIL import Image
8
- import imagehash
9
- from difflib import SequenceMatcher
10
- from sklearn.feature_extraction.text import TfidfVectorizer
11
- from sklearn.metrics.pairwise import cosine_similarity
12
- import time
13
  import io
14
  import base64
15
  from urllib.parse import urljoin, urlparse
16
  import pandas as pd
17
  import plotly.graph_objects as go
18
  import numpy as np
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  def initialize_session_state():
21
  if 'visited_urls' not in st.session_state:
@@ -27,16 +36,20 @@ def initialize_session_state():
27
  if 'crawl_results' not in st.session_state:
28
  st.session_state.crawl_results = []
29
 
30
def setup_chrome_driver():
    """Create a headless Chrome WebDriver suitable for containerized environments."""
    opts = Options()
    # Headless plus relaxed sandbox/shared-memory flags so Chrome can start
    # inside Docker/CI hosts where /dev/shm is small and no display exists.
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)
 
 
 
36
 
37
def capture_screenshot(driver):
    """Grab the driver's current viewport and return it as a PIL Image."""
    png_bytes = driver.get_screenshot_as_png()
    buffer = io.BytesIO(png_bytes)
    return Image.open(buffer)
 
40
 
41
  def calculate_similarity(text1, text2):
42
  # Basic similarity
@@ -52,52 +65,64 @@ def calculate_similarity(text1, text2):
52
 
53
  return basic_ratio, semantic_ratio
54
 
55
def crawl_website(url, max_pages=10, search_term=None):
    """Breadth-limited crawl of a single site starting at *url*.

    Visits up to *max_pages* same-domain pages. If *search_term* is given,
    only pages whose text contains it (case-insensitive) are recorded;
    otherwise every visited page is recorded.

    Returns a list of dicts with 'url', 'title', 'content_preview', and
    'matches_search' keys. Errors on individual pages are reported via
    st.error and do not abort the crawl.
    """
    visited = set()
    to_visit = {url}
    results = []

    while to_visit and len(visited) < max_pages:
        current_url = to_visit.pop()
        if current_url in visited:
            continue

        try:
            # Fix: a timeout prevents one unresponsive server from hanging
            # the entire crawl (the original call could block indefinitely).
            response = requests.get(current_url, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            visited.add(current_url)

            # Extract visible text content for matching and previews.
            text_content = soup.get_text()

            # If a search term was provided, check for matches.
            match_found = search_term.lower() in text_content.lower() if search_term else True

            if match_found:
                results.append({
                    'url': current_url,
                    'title': soup.title.string if soup.title else 'No title',
                    'content_preview': text_content[:200],
                    'matches_search': match_found
                })

            # Queue new same-domain links; fix: skip URLs already visited so
            # the frontier does not refill with pages we have processed.
            for link in soup.find_all('a'):
                href = link.get('href')
                if href:
                    absolute_url = urljoin(current_url, href)
                    if (urlparse(absolute_url).netloc == urlparse(url).netloc
                            and absolute_url not in visited):
                        to_visit.add(absolute_url)

        except Exception as e:
            st.error(f"Error crawling {current_url}: {str(e)}")

    return results
96
 
97
  def main():
98
  st.title("Web Testing and Crawling Suite")
99
  initialize_session_state()
100
 
 
 
 
 
101
  # Sidebar for tool selection
102
  tool = st.sidebar.radio(
103
  "Select Tool",
@@ -111,41 +136,46 @@ def main():
111
  max_cycles = st.number_input("Number of test cycles", 1, 100, 1)
112
 
113
  if st.button("Start Testing"):
114
- driver = setup_chrome_driver()
115
-
116
- for cycle in range(max_cycles):
117
- start_time = time.time()
118
-
119
  try:
120
- driver.get(url)
121
- load_time = time.time() - start_time
122
- st.session_state.load_times.append(load_time)
123
-
124
- # Capture screenshot
125
- screenshot = capture_screenshot(driver)
126
- st.session_state.screenshots.append(screenshot)
127
-
128
- # Show results
129
- st.success(f"Cycle {cycle + 1} completed - Load time: {load_time:.2f}s")
130
- st.image(screenshot, caption=f"Screenshot - Cycle {cycle + 1}")
131
-
132
- # Plot load times
133
- fig = go.Figure(data=go.Scatter(
134
- x=list(range(1, len(st.session_state.load_times) + 1)),
135
- y=st.session_state.load_times,
136
- mode='lines+markers'
137
- ))
138
- fig.update_layout(title="Page Load Times",
139
- xaxis_title="Cycle",
140
- yaxis_title="Load Time (s)")
141
- st.plotly_chart(fig)
142
-
143
- time.sleep(interval)
144
-
145
- except Exception as e:
146
- st.error(f"Error in cycle {cycle + 1}: {str(e)}")
147
 
148
- driver.quit()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  elif tool == "Crawler":
151
  st.header("Web Crawler")
@@ -154,7 +184,7 @@ def main():
154
  search_term = st.text_input("Search term (optional)")
155
 
156
  if st.button("Start Crawling"):
157
- results = crawl_website(base_url, max_pages, search_term)
158
  st.session_state.crawl_results = results
159
 
160
  # Display results
@@ -174,46 +204,45 @@ def main():
174
  url2 = st.text_input("Enter second URL (Comparison content)")
175
 
176
  if st.button("Compare Content"):
177
- driver = setup_chrome_driver()
178
-
179
- try:
180
- # Get content from first URL
181
- driver.get(url1)
182
- content1 = driver.find_element(By.TAG_NAME, "body").text
183
-
184
- # Get content from second URL
185
- driver.get(url2)
186
- content2 = driver.find_element(By.TAG_NAME, "body").text
187
-
188
- # Calculate similarities
189
- basic_ratio, semantic_ratio = calculate_similarity(content1, content2)
190
-
191
- # Display results
192
- st.subheader("Similarity Results")
193
- col1, col2 = st.columns(2)
194
-
195
- with col1:
196
- st.metric("Basic Similarity", f"{basic_ratio:.2%}")
197
-
198
- with col2:
199
- st.metric("Semantic Similarity", f"{semantic_ratio:.2%}")
200
-
201
- # Show content previews
202
- st.subheader("Content Previews")
203
- st.text_area("Content 1 (First 500 chars)", content1[:500])
204
- st.text_area("Content 2 (First 500 chars)", content2[:500])
205
-
206
- except Exception as e:
207
- st.error(f"Error comparing content: {str(e)}")
208
-
209
- finally:
210
- driver.quit()
211
 
212
  if __name__ == "__main__":
213
  main()
214
-
215
 
216
- AppGoals="""
217
  Computer Use
218
  1. Browser based testing app
219
2. similar to apps I wrote years ago which would operate a browser then run tests against my web apps including being able to compare any image or text content together to search results from one of my ai programs to determine content overlap which is then used to evaluate the results and update my ai model context data to store anything that was found that adds to the original idea. When I looked at this problem before I found chrome driver for automatic testing, saucelabs which can kind of do it, and then some python testing libraries which could do it. Can you enlighten me on which python libraries and potentially dev tools would help me with this to automate my testing and evaluation of my ai generated content which resides at many different URLs on huggingface as running apps
 
1
  import streamlit as st
2
  import requests
3
  from bs4 import BeautifulSoup
 
 
 
4
  from PIL import Image
 
 
 
 
 
5
  import io
6
  import base64
7
  from urllib.parse import urljoin, urlparse
8
  import pandas as pd
9
  import plotly.graph_objects as go
10
  import numpy as np
11
+ from difflib import SequenceMatcher
12
+ from sklearn.feature_extraction.text import TfidfVectorizer
13
+ from sklearn.metrics.pairwise import cosine_similarity
14
+ import time
15
+ import asyncio
16
+ from playwright.sync_api import sync_playwright
17
+ import sys
18
+ import subprocess
19
+
20
def install_playwright_deps():
    """Install Playwright's browser binaries, at most once per session.

    Failures are surfaced in the Streamlit UI (st.error/st.info) instead of
    raising, so the rest of the app can still render.
    """
    # Streamlit reruns the whole script on every interaction; without this
    # guard the (slow) install subprocess would run on each rerun.
    if st.session_state.get('playwright_installed'):
        return
    try:
        # Verify the package is importable before shelling out.
        from playwright.sync_api import sync_playwright  # noqa: F401
        # Fix: invoke via `python -m playwright` so this works even when the
        # `playwright` console script is not on PATH (common on hosted apps).
        subprocess.run([sys.executable, '-m', 'playwright', 'install'], check=True)
        st.session_state.playwright_installed = True
    except Exception as e:
        st.error(f"Error installing Playwright dependencies: {str(e)}")
        st.info("Try running 'pip install playwright' and 'playwright install' manually")
28
 
29
  def initialize_session_state():
30
  if 'visited_urls' not in st.session_state:
 
36
  if 'crawl_results' not in st.session_state:
37
  st.session_state.crawl_results = []
38
 
39
def setup_browser():
    """Start Playwright and launch a headless Chromium instance.

    Returns a ``(playwright, browser)`` pair, or ``(None, None)`` when
    startup fails (the error is shown in the Streamlit UI).
    """
    try:
        pw = sync_playwright().start()
        chromium = pw.chromium.launch(headless=True)
    except Exception as e:
        st.error(f"Error setting up browser: {str(e)}")
        return None, None
    return pw, chromium
48
 
49
def capture_screenshot(page):
    """Render the Playwright page's current viewport as a PIL Image."""
    buffer = io.BytesIO(page.screenshot())
    return Image.open(buffer)
53
 
54
  def calculate_similarity(text1, text2):
55
  # Basic similarity
 
65
 
66
  return basic_ratio, semantic_ratio
67
 
68
def _crawl_sync(url, max_pages, search_term):
    """Synchronous Playwright crawl; must run OFF the asyncio event loop
    (see crawl_website below for why)."""
    visited = set()
    to_visit = {url}
    results = []

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()

            while to_visit and len(visited) < max_pages:
                current_url = to_visit.pop()
                if current_url in visited:
                    continue

                try:
                    page.goto(current_url, wait_until="networkidle")
                    visited.add(current_url)

                    # Fix: match against the rendered text, not page.content()
                    # (raw HTML), so search terms are not "found" inside markup
                    # or scripts — this mirrors the pre-Playwright behavior of
                    # soup.get_text().
                    text_content = page.inner_text("body")

                    # If a search term was provided, check for matches.
                    match_found = (search_term.lower() in text_content.lower()
                                   if search_term else True)

                    if match_found:
                        results.append({
                            'url': current_url,
                            'title': page.title(),
                            'content_preview': text_content[:200],
                            'matches_search': match_found
                        })

                    # Queue new same-domain links; skip already-visited URLs
                    # so the frontier does not refill with processed pages.
                    links = page.eval_on_selector_all(
                        'a[href]', 'elements => elements.map(el => el.href)')
                    for href in links:
                        absolute_url = urljoin(current_url, href)
                        if (urlparse(absolute_url).netloc == urlparse(url).netloc
                                and absolute_url not in visited):
                            to_visit.add(absolute_url)

                except Exception as e:
                    st.error(f"Error crawling {current_url}: {str(e)}")

            browser.close()

    except Exception as e:
        st.error(f"Error in crawl process: {str(e)}")

    return results

async def crawl_website(url, max_pages=10, search_term=None):
    """Crawl up to *max_pages* same-domain pages starting at *url*.

    Bug fix: this coroutine is driven by asyncio.run(), but Playwright's
    sync API raises ("Sync API inside asyncio loop") when called from a
    running event loop. The actual crawl is therefore executed in a worker
    thread via asyncio.to_thread, keeping the awaitable interface callers
    expect while making the sync API legal.

    Returns the list of result dicts produced by the crawl.
    """
    return await asyncio.to_thread(_crawl_sync, url, max_pages, search_term)
117
 
118
  def main():
119
  st.title("Web Testing and Crawling Suite")
120
  initialize_session_state()
121
 
122
+ # Install dependencies if needed
123
+ with st.spinner("Checking dependencies..."):
124
+ install_playwright_deps()
125
+
126
  # Sidebar for tool selection
127
  tool = st.sidebar.radio(
128
  "Select Tool",
 
136
  max_cycles = st.number_input("Number of test cycles", 1, 100, 1)
137
 
138
  if st.button("Start Testing"):
139
+ playwright, browser = setup_browser()
140
+ if playwright and browser:
 
 
 
141
  try:
142
+ page = browser.new_page()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
+ for cycle in range(max_cycles):
145
+ start_time = time.time()
146
+
147
+ try:
148
+ page.goto(url, wait_until="networkidle")
149
+ load_time = time.time() - start_time
150
+ st.session_state.load_times.append(load_time)
151
+
152
+ # Capture screenshot
153
+ screenshot = capture_screenshot(page)
154
+ st.session_state.screenshots.append(screenshot)
155
+
156
+ # Show results
157
+ st.success(f"Cycle {cycle + 1} completed - Load time: {load_time:.2f}s")
158
+ st.image(screenshot, caption=f"Screenshot - Cycle {cycle + 1}")
159
+
160
+ # Plot load times
161
+ fig = go.Figure(data=go.Scatter(
162
+ x=list(range(1, len(st.session_state.load_times) + 1)),
163
+ y=st.session_state.load_times,
164
+ mode='lines+markers'
165
+ ))
166
+ fig.update_layout(title="Page Load Times",
167
+ xaxis_title="Cycle",
168
+ yaxis_title="Load Time (s)")
169
+ st.plotly_chart(fig)
170
+
171
+ time.sleep(interval)
172
+
173
+ except Exception as e:
174
+ st.error(f"Error in cycle {cycle + 1}: {str(e)}")
175
+
176
+ finally:
177
+ browser.close()
178
+ playwright.stop()
179
 
180
  elif tool == "Crawler":
181
  st.header("Web Crawler")
 
184
  search_term = st.text_input("Search term (optional)")
185
 
186
  if st.button("Start Crawling"):
187
+ results = asyncio.run(crawl_website(base_url, max_pages, search_term))
188
  st.session_state.crawl_results = results
189
 
190
  # Display results
 
204
  url2 = st.text_input("Enter second URL (Comparison content)")
205
 
206
  if st.button("Compare Content"):
207
+ playwright, browser = setup_browser()
208
+ if playwright and browser:
209
+ try:
210
+ page = browser.new_page()
211
+
212
+ # Get content from first URL
213
+ page.goto(url1, wait_until="networkidle")
214
+ content1 = page.content()
215
+
216
+ # Get content from second URL
217
+ page.goto(url2, wait_until="networkidle")
218
+ content2 = page.content()
219
+
220
+ # Calculate similarities
221
+ basic_ratio, semantic_ratio = calculate_similarity(content1, content2)
222
+
223
+ # Display results
224
+ st.subheader("Similarity Results")
225
+ col1, col2 = st.columns(2)
226
+
227
+ with col1:
228
+ st.metric("Basic Similarity", f"{basic_ratio:.2%}")
229
+
230
+ with col2:
231
+ st.metric("Semantic Similarity", f"{semantic_ratio:.2%}")
232
+
233
+ # Show content previews
234
+ st.subheader("Content Previews")
235
+ st.text_area("Content 1 (First 500 chars)", content1[:500])
236
+ st.text_area("Content 2 (First 500 chars)", content2[:500])
237
+
238
+ finally:
239
+ browser.close()
240
+ playwright.stop()
241
 
242
  if __name__ == "__main__":
243
  main()
 
244
 
245
+ Goals="""
246
  Computer Use
247
  1. Browser based testing app
248
2. similar to apps I wrote years ago which would operate a browser then run tests against my web apps including being able to compare any image or text content together to search results from one of my ai programs to determine content overlap which is then used to evaluate the results and update my ai model context data to store anything that was found that adds to the original idea. When I looked at this problem before I found chrome driver for automatic testing, saucelabs which can kind of do it, and then some python testing libraries which could do it. Can you enlighten me on which python libraries and potentially dev tools would help me with this to automate my testing and evaluation of my ai generated content which resides at many different URLs on huggingface as running apps