Update app.py
app.py CHANGED
@@ -1,21 +1,30 @@
 import streamlit as st
 import requests
 from bs4 import BeautifulSoup
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.common.by import By
 from PIL import Image
-import imagehash
-from difflib import SequenceMatcher
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
-import time
 import io
 import base64
 from urllib.parse import urljoin, urlparse
 import pandas as pd
 import plotly.graph_objects as go
 import numpy as np
+from difflib import SequenceMatcher
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import time
+import asyncio
+from playwright.sync_api import sync_playwright
+import sys
+import subprocess
+
+def install_playwright_deps():
+    try:
+        from playwright.sync_api import sync_playwright
+        # Install browsers if not already installed
+        subprocess.run(['playwright', 'install'], check=True)
+    except Exception as e:
+        st.error(f"Error installing Playwright dependencies: {str(e)}")
+        st.info("Try running 'pip install playwright' and 'playwright install' manually")
 
 def initialize_session_state():
     if 'visited_urls' not in st.session_state:
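Note on the installer added above: subprocess.run(['playwright', 'install']) relies on the playwright console script being on PATH, which is not guaranteed inside a Spaces container. A more robust variant, sketched below (the helper name is hypothetical and this is not part of the commit), invokes the module through the current interpreter, which would also put the otherwise-unused sys import to work:

import subprocess
import sys

def install_playwright_browsers():
    # Equivalent to running "python -m playwright install chromium"; this works
    # even when the `playwright` script is not on PATH, and limiting the download
    # to Chromium matches the only browser the app launches.
    subprocess.run([sys.executable, "-m", "playwright", "install", "chromium"], check=True)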
@@ -27,16 +36,20 @@ def initialize_session_state():
     if 'crawl_results' not in st.session_state:
        st.session_state.crawl_results = []
 
-def
-
-
-
-
-
+def setup_browser():
+    """Initialize Playwright browser"""
+    try:
+        playwright = sync_playwright().start()
+        browser = playwright.chromium.launch(headless=True)
+        return playwright, browser
+    except Exception as e:
+        st.error(f"Error setting up browser: {str(e)}")
+        return None, None
 
-def capture_screenshot(
-    screenshot
-
+def capture_screenshot(page):
+    """Capture screenshot using Playwright"""
+    screenshot_bytes = page.screenshot()
+    return Image.open(io.BytesIO(screenshot_bytes))
 
 def calculate_similarity(text1, text2):
     # Basic similarity
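calculate_similarity itself is unchanged, so the diff elides its body. Given the SequenceMatcher, TfidfVectorizer, and cosine_similarity imports and the basic_ratio, semantic_ratio return values, a plausible reading of the function is the following sketch (an illustration, not the file's actual code):

from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(text1, text2):
    # Basic similarity: character-level match ratio
    basic_ratio = SequenceMatcher(None, text1, text2).ratio()

    # Semantic similarity: cosine of the two documents' TF-IDF vectors
    tfidf = TfidfVectorizer().fit_transform([text1, text2])
    semantic_ratio = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]

    return basic_ratio, semantic_ratio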
@@ -52,52 +65,64 @@ def calculate_similarity(text1, text2):
 
     return basic_ratio, semantic_ratio
 
-def crawl_website(url, max_pages=10, search_term=None):
+async def crawl_website(url, max_pages=10, search_term=None):
     visited = set()
     to_visit = {url}
     results = []
 
-
-
-
-
-
-        try:
-            response = requests.get(current_url)
-            soup = BeautifulSoup(response.text, 'html.parser')
-            visited.add(current_url)
-
-            # Extract text content
-            text_content = soup.get_text()
-
-            # If search term provided, check for matches
-            match_found = search_term.lower() in text_content.lower() if search_term else True
-
-            if match_found:
-                results.append({
-                    'url': current_url,
-                    'title': soup.title.string if soup.title else 'No title',
-                    'content_preview': text_content[:200],
-                    'matches_search': match_found
-                })
 
-
-
-
-
-
-
-
-
-
-
+    try:
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=True)
+            page = browser.new_page()
+
+            while to_visit and len(visited) < max_pages:
+                current_url = to_visit.pop()
+                if current_url in visited:
+                    continue
+
+                try:
+                    page.goto(current_url, wait_until="networkidle")
+                    visited.add(current_url)
+
+                    # Extract text content
+                    text_content = page.content()
+
+                    # If search term provided, check for matches
+                    match_found = search_term.lower() in text_content.lower() if search_term else True
+
+                    if match_found:
+                        results.append({
+                            'url': current_url,
+                            'title': page.title(),
+                            'content_preview': text_content[:200],
+                            'matches_search': match_found
+                        })
+
+                    # Find new links
+                    links = page.eval_on_selector_all('a[href]', 'elements => elements.map(el => el.href)')
+                    for href in links:
+                        absolute_url = urljoin(current_url, href)
+                        if urlparse(absolute_url).netloc == urlparse(url).netloc:
+                            to_visit.add(absolute_url)
+
+                except Exception as e:
+                    st.error(f"Error crawling {current_url}: {str(e)}")
+
+            browser.close()
+
+    except Exception as e:
+        st.error(f"Error in crawl process: {str(e)}")
+
     return results
 
 def main():
     st.title("Web Testing and Crawling Suite")
     initialize_session_state()
 
     # Sidebar for tool selection
     tool = st.sidebar.radio(
        "Select Tool",
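One behavioral change worth noting in this hunk: the old crawler searched soup.get_text(), i.e. visible text only, while the Playwright version searches page.content(), which is the full HTML source, so a search term can now match tag names, attributes, or script bodies. A sketch of restoring the old semantics by reusing the BeautifulSoup import that is still in the file (the helper name is hypothetical):

from bs4 import BeautifulSoup

def visible_text(page):
    """Return the rendered page's visible text, mirroring the old soup.get_text().

    page.content() includes markup and scripts; stripping them keeps search
    matches limited to what a user would actually see on the page.
    """
    return BeautifulSoup(page.content(), 'html.parser').get_text(separator=' ', strip=True)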
@@ -111,41 +136,46 @@ def main():
         max_cycles = st.number_input("Number of test cycles", 1, 100, 1)
 
         if st.button("Start Testing"):
-
-
-            for cycle in range(max_cycles):
-                start_time = time.time()
-
+            playwright, browser = setup_browser()
+            if playwright and browser:
                 try:
-
-                    load_time = time.time() - start_time
-                    st.session_state.load_times.append(load_time)
-
-                    # Capture screenshot
-                    screenshot = capture_screenshot(driver)
-                    st.session_state.screenshots.append(screenshot)
-
-                    # Show results
-                    st.success(f"Cycle {cycle + 1} completed - Load time: {load_time:.2f}s")
-                    st.image(screenshot, caption=f"Screenshot - Cycle {cycle + 1}")
-
-                    # Plot load times
-                    fig = go.Figure(data=go.Scatter(
-                        x=list(range(1, len(st.session_state.load_times) + 1)),
-                        y=st.session_state.load_times,
-                        mode='lines+markers'
-                    ))
-                    fig.update_layout(title="Page Load Times",
-                                      xaxis_title="Cycle",
-                                      yaxis_title="Load Time (s)")
-                    st.plotly_chart(fig)
-
-                    time.sleep(interval)
-
-                except Exception as e:
-                    st.error(f"Error in cycle {cycle + 1}: {str(e)}")
+                    page = browser.new_page()
 
-
+                    for cycle in range(max_cycles):
+                        start_time = time.time()
+
+                        try:
+                            page.goto(url, wait_until="networkidle")
+                            load_time = time.time() - start_time
+                            st.session_state.load_times.append(load_time)
+
+                            # Capture screenshot
+                            screenshot = capture_screenshot(page)
+                            st.session_state.screenshots.append(screenshot)
+
+                            # Show results
+                            st.success(f"Cycle {cycle + 1} completed - Load time: {load_time:.2f}s")
+                            st.image(screenshot, caption=f"Screenshot - Cycle {cycle + 1}")
+
+                            # Plot load times
+                            fig = go.Figure(data=go.Scatter(
+                                x=list(range(1, len(st.session_state.load_times) + 1)),
+                                y=st.session_state.load_times,
+                                mode='lines+markers'
+                            ))
+                            fig.update_layout(title="Page Load Times",
+                                              xaxis_title="Cycle",
+                                              yaxis_title="Load Time (s)")
+                            st.plotly_chart(fig)
+
+                            time.sleep(interval)
+
+                        except Exception as e:
+                            st.error(f"Error in cycle {cycle + 1}: {str(e)}")
+
+                finally:
+                    browser.close()
+                    playwright.stop()
 
     elif tool == "Crawler":
         st.header("Web Crawler")
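The testing loop stores every measurement in st.session_state.load_times but only plots the raw series; the pandas and numpy imports at the top of the file are otherwise unused. A sketch of a summary panel they could back (an addition, not part of the commit; the function name is hypothetical):

import numpy as np
import pandas as pd
import streamlit as st

def show_load_time_summary():
    # Summarize the load times accumulated across test cycles
    times = pd.Series(st.session_state.load_times, name="load_time_s")
    if times.empty:
        st.info("No load times recorded yet")
        return
    st.dataframe(times.describe())  # count, mean, std, min, quartiles, max
    st.metric("p95 load time", f"{np.percentile(times, 95):.2f}s")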
@@ -154,7 +184,7 @@ def main():
         search_term = st.text_input("Search term (optional)")
 
         if st.button("Start Crawling"):
-            results = crawl_website(base_url, max_pages, search_term)
+            results = asyncio.run(crawl_website(base_url, max_pages, search_term))
             st.session_state.crawl_results = results
 
             # Display results
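As committed, this combination is a bug: crawl_website is declared async but drives the sync API, and Playwright raises an error ("using Playwright Sync API inside the asyncio loop") when sync_playwright() is entered while an event loop is running, which asyncio.run guarantees here. A sketch of the simpler fix, dropping the async/asyncio.run wrapper entirely:

# Sketch (assumption): declare the crawler as a plain function and keep the sync API
def crawl_website(url, max_pages=10, search_term=None):
    ...  # body exactly as in the hunk above

# Call site, without asyncio:
# results = crawl_website(base_url, max_pages, search_term)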
@@ -174,46 +204,45 @@ def main():
         url2 = st.text_input("Enter second URL (Comparison content)")
 
         if st.button("Compare Content"):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            playwright, browser = setup_browser()
+            if playwright and browser:
+                try:
+                    page = browser.new_page()
+
+                    # Get content from first URL
+                    page.goto(url1, wait_until="networkidle")
+                    content1 = page.content()
+
+                    # Get content from second URL
+                    page.goto(url2, wait_until="networkidle")
+                    content2 = page.content()
+
+                    # Calculate similarities
+                    basic_ratio, semantic_ratio = calculate_similarity(content1, content2)
+
+                    # Display results
+                    st.subheader("Similarity Results")
+                    col1, col2 = st.columns(2)
+
+                    with col1:
+                        st.metric("Basic Similarity", f"{basic_ratio:.2%}")
+
+                    with col2:
+                        st.metric("Semantic Similarity", f"{semantic_ratio:.2%}")
+
+                    # Show content previews
+                    st.subheader("Content Previews")
+                    st.text_area("Content 1 (First 500 chars)", content1[:500])
+                    st.text_area("Content 2 (First 500 chars)", content2[:500])
+
+                finally:
+                    browser.close()
+                    playwright.stop()
 
 if __name__ == "__main__":
     main()
-
 
-
+Goals="""
 Computer Use
 1. Browser based testing app
 2. similar to apps I wrote years ago which would operate a browser then run tests against my web apps including being able to compare any image or text content together to search results from one of my ai programs to determine content overlap which is then used to evaluate the results and update my ai model context data to store anything that was found that adds to the original idea. When I looked at this problem before I found chrome driver for automatic testing, saucelabs which can kind of do it, and then some python testing libraries which could do it. Can you enlighten me on which python libraries and potenitally dev tools which would help me with this to automate my testing and evaluation of my ai generated content which resides at many different URLs on huggingface as running apps
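For the workflow described in the Goals text, loading AI-generated apps at Hugging Face URLs and scoring their content against reference output, the pieces in this file compose into a small evaluation loop. A sketch under those assumptions (the function name, URL, and reference text are placeholders; note that a Space's main page embeds the app in an iframe, so the app's direct *.hf.space URL is usually the better target):

from playwright.sync_api import sync_playwright
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def score_space_against_reference(space_url, reference_text):
    """Load a running Space and return the TF-IDF cosine overlap with a reference."""
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto(space_url, wait_until="networkidle")
        text = page.inner_text("body")  # rendered text, not raw HTML
        browser.close()
    tfidf = TfidfVectorizer().fit_transform([text, reference_text])
    return cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]

# Hypothetical usage:
# overlap = score_space_against_reference("https://huggingface.co/spaces/<user>/<app>", "expected output text")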