File size: 14,048 Bytes
579471f
f19e94e
 
 
 
 
 
 
 
 
3386078
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
579471f
f19e94e
 
 
 
 
 
 
 
 
 
3386078
 
 
 
 
 
 
 
 
f19e94e
3386078
 
 
 
f19e94e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3386078
f19e94e
 
 
 
3386078
 
 
 
f19e94e
3386078
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f19e94e
3386078
 
 
f19e94e
 
 
 
 
 
3386078
 
 
 
f19e94e
 
 
 
 
 
 
 
 
 
 
 
 
3386078
 
f19e94e
3386078
f19e94e
3386078
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f19e94e
 
 
 
 
 
 
 
3386078
f19e94e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3386078
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f19e94e
 
 
 
3386078
579471f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
import streamlit as st
import requests
from bs4 import BeautifulSoup
from PIL import Image
import io
import base64
from urllib.parse import urljoin, urlparse
import pandas as pd
import plotly.graph_objects as go
import numpy as np
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import time
import asyncio
from playwright.sync_api import sync_playwright
import sys
import subprocess

def install_playwright_deps():
    """Make sure Playwright and its browser binaries are installed.

    The browser download is started with ``<python> -m playwright install``
    using the interpreter that is running this app, so it works even when the
    ``playwright`` console script is not on ``PATH`` (for example in a
    virtual environment that was never activated).

    Streamlit re-executes the script on every widget interaction, so a flag
    in ``st.session_state`` records a successful check and later reruns
    return immediately instead of spawning the installer again.

    Failures are reported in the UI (``st.error`` / ``st.info``) and are not
    raised, so the rest of the page still renders.
    """
    if 'playwright_deps_checked' in st.session_state:
        return

    try:
        # Imported only to verify that the package itself is importable.
        from playwright.sync_api import sync_playwright  # noqa: F401
        # Install browsers if not already installed
        subprocess.run([sys.executable, '-m', 'playwright', 'install'], check=True)
    except Exception as e:
        st.error(f"Error installing Playwright dependencies: {str(e)}")
        st.info("Try running 'pip install playwright' and 'playwright install' manually")
    else:
        # Only remember success; a failed attempt is retried on the next rerun.
        st.session_state['playwright_deps_checked'] = True

def initialize_session_state():
    """Create the per-session list slots used by the app if they are missing.

    Each key listed below is set to a fresh empty list the first time the
    script runs in a session; keys that already exist are left untouched so
    their contents survive Streamlit reruns.
    """
    list_slots = ('visited_urls', 'load_times', 'screenshots', 'crawl_results')
    for slot in list_slots:
        if slot not in st.session_state:
            st.session_state[slot] = []

def setup_browser():
    """Start Playwright and launch a headless Chromium browser.

    Returns:
        A ``(playwright, browser)`` tuple on success. The caller owns both
        objects and must call ``browser.close()`` and ``playwright.stop()``.
        On any failure the error is shown with ``st.error`` and
        ``(None, None)`` is returned.
    """
    playwright = None
    try:
        playwright = sync_playwright().start()
        browser = playwright.chromium.launch(headless=True)
        return playwright, browser
    except Exception as e:
        if playwright is not None:
            # The driver started but the browser launch failed: stop the
            # driver here, because the caller only receives (None, None) and
            # therefore has nothing to clean up.
            try:
                playwright.stop()
            except Exception:
                # Best-effort cleanup; the original launch error reported
                # below is the one the user needs to see.
                pass
        st.error(f"Error setting up browser: {str(e)}")
        return None, None

def capture_screenshot(page):
    """Take a screenshot of *page* with Playwright and return it as a PIL image.

    Args:
        page: Object exposing ``screenshot()`` that returns the encoded image
            bytes (a Playwright page in this app).

    Returns:
        The image opened by ``PIL.Image.open`` from an in-memory buffer.
    """
    buffer = io.BytesIO(page.screenshot())
    return Image.open(buffer)

def calculate_similarity(text1, text2):
    """Compare two texts with a character-level and a TF-IDF based measure.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A ``(basic_ratio, semantic_ratio)`` tuple of floats:

        * ``basic_ratio`` - ``difflib.SequenceMatcher(...).ratio()`` of the
          two strings.
        * ``semantic_ratio`` - cosine similarity of the TF-IDF vectors of
          the two texts, or ``0.0`` when no vocabulary can be built from
          them (for example when both are empty).
    """
    # Basic similarity
    basic_ratio = SequenceMatcher(None, text1, text2).ratio()

    # Semantic similarity
    try:
        tfidf = TfidfVectorizer().fit_transform([text1, text2])
        semantic_ratio = float(cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0])
    except ValueError:
        # TfidfVectorizer raises ValueError for an empty vocabulary. Only
        # that case is treated as "no similarity"; anything else (including
        # KeyboardInterrupt) is no longer silently swallowed.
        semantic_ratio = 0.0

    return basic_ratio, semantic_ratio

async def crawl_website(url, max_pages=10, search_term=None):
    """Crawl pages on the same host as *url* and collect the matching ones.

    Starting from *url*, pages are loaded in a headless Chromium browser and
    every ``<a href>`` whose host equals the start URL's host is queued.
    Crawling stops when the queue is empty or *max_pages* pages have been
    loaded successfully.

    Args:
        url: Start URL; its network location restricts which links are
            followed.
        max_pages: Maximum number of successfully loaded pages.
        search_term: Optional case-insensitive substring looked up in the
            page's HTML source. When empty or ``None`` every page matches.

    Returns:
        A list of dicts with the keys ``url``, ``title``,
        ``content_preview`` (first 200 characters of the HTML source) and
        ``matches_search``, one per matching page.

    Errors are reported with ``st.error`` and never raised; a page that
    fails is skipped and not retried.
    """
    # Local import: this coroutine runs under asyncio.run(), and Playwright's
    # sync API refuses to operate inside a running event loop, so the async
    # API has to be used here.
    from playwright.async_api import async_playwright

    start_host = urlparse(url).netloc
    needle = search_term.lower() if search_term else None

    visited = set()
    failed = set()
    to_visit = {url}
    results = []

    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()

                while to_visit and len(visited) < max_pages:
                    current_url = to_visit.pop()
                    if current_url in visited or current_url in failed:
                        continue

                    try:
                        await page.goto(current_url, wait_until="networkidle")
                        visited.add(current_url)

                        # NOTE: this is the full HTML source, so the search
                        # below also matches text inside markup.
                        text_content = await page.content()

                        # If search term provided, check for matches
                        match_found = needle in text_content.lower() if needle else True

                        if match_found:
                            results.append({
                                'url': current_url,
                                'title': await page.title(),
                                'content_preview': text_content[:200],
                                'matches_search': match_found
                            })

                        # Find new links
                        links = await page.eval_on_selector_all(
                            'a[href]', 'elements => elements.map(el => el.href)'
                        )
                        for href in links:
                            absolute_url = urljoin(current_url, href)
                            if (urlparse(absolute_url).netloc == start_host
                                    and absolute_url not in visited):
                                to_visit.add(absolute_url)

                    except Exception as e:
                        # Remember the failure so a link back to this URL
                        # does not trigger the same error again.
                        failed.add(current_url)
                        st.error(f"Error crawling {current_url}: {str(e)}")
            finally:
                # Close the browser even if the loop is aborted unexpectedly.
                await browser.close()

    except Exception as e:
        st.error(f"Error in crawl process: {str(e)}")

    return results

def _render_webtest():
    """Render the WebTest tool: repeated page loads with timing and screenshots."""
    st.header("WebTest - Web Performance Testing")
    url = st.text_input("Enter URL to test")
    interval = st.slider("Time interval between requests (seconds)", 1, 30, 5)
    max_cycles = st.number_input("Number of test cycles", 1, 100, 1)

    if not st.button("Start Testing"):
        return
    url = url.strip()
    if not url:
        st.warning("Please enter a URL to test.")
        return

    playwright, browser = setup_browser()
    if not (playwright and browser):
        # setup_browser() has already reported the problem.
        return

    try:
        page = browser.new_page()

        for cycle in range(max_cycles):
            # perf_counter is monotonic, so the measured duration cannot be
            # distorted by system clock adjustments.
            start_time = time.perf_counter()

            try:
                page.goto(url, wait_until="networkidle")
                load_time = time.perf_counter() - start_time
                st.session_state.load_times.append(load_time)

                # Capture screenshot
                screenshot = capture_screenshot(page)
                st.session_state.screenshots.append(screenshot)

                # Show results
                st.success(f"Cycle {cycle + 1} completed - Load time: {load_time:.2f}s")
                st.image(screenshot, caption=f"Screenshot - Cycle {cycle + 1}")

                # Plot load times
                fig = go.Figure(data=go.Scatter(
                    x=list(range(1, len(st.session_state.load_times) + 1)),
                    y=st.session_state.load_times,
                    mode='lines+markers'
                ))
                fig.update_layout(title="Page Load Times",
                                  xaxis_title="Cycle",
                                  yaxis_title="Load Time (s)")
                st.plotly_chart(fig)

                # No need to wait after the final cycle.
                if cycle + 1 < max_cycles:
                    time.sleep(interval)

            except Exception as e:
                st.error(f"Error in cycle {cycle + 1}: {str(e)}")

    finally:
        browser.close()
        playwright.stop()


def _render_crawler():
    """Render the Crawler tool: same-host crawl with optional search term and CSV export."""
    st.header("Web Crawler")
    base_url = st.text_input("Enter base URL to crawl")
    max_pages = st.number_input("Maximum pages to crawl", 1, 100, 10)
    search_term = st.text_input("Search term (optional)")

    if st.button("Start Crawling"):
        base_url = base_url.strip()
        if not base_url:
            st.warning("Please enter a base URL to crawl.")
        else:
            results = asyncio.run(crawl_website(base_url, max_pages, search_term))
            st.session_state.crawl_results = results
            if not results:
                st.info("No matching pages found.")

    # Results are rendered from session state, outside the button branch:
    # a button is only True during the rerun triggered by its own click, so
    # an export control nested inside "Start Crawling" could never fire.
    results = st.session_state.crawl_results
    if results:
        df = pd.DataFrame(results)
        st.dataframe(df)

        # Export options
        st.download_button(
            "Export Results",
            data=df.to_csv(index=False),
            file_name="crawl_results.csv",
            mime="text/csv",
        )


def _render_content_comparison():
    """Render the comparison tool: fetch two pages and show their similarity scores."""
    st.header("AI Content Comparison")
    url1 = st.text_input("Enter first URL (AI-generated content)")
    url2 = st.text_input("Enter second URL (Comparison content)")

    if not st.button("Compare Content"):
        return
    url1, url2 = url1.strip(), url2.strip()
    if not (url1 and url2):
        st.warning("Please enter both URLs to compare.")
        return

    playwright, browser = setup_browser()
    if not (playwright and browser):
        # setup_browser() has already reported the problem.
        return

    try:
        page = browser.new_page()

        # Get content from first URL
        page.goto(url1, wait_until="networkidle")
        content1 = page.content()

        # Get content from second URL
        page.goto(url2, wait_until="networkidle")
        content2 = page.content()

        # Calculate similarities
        basic_ratio, semantic_ratio = calculate_similarity(content1, content2)

        # Display results
        st.subheader("Similarity Results")
        col1, col2 = st.columns(2)

        with col1:
            st.metric("Basic Similarity", f"{basic_ratio:.2%}")

        with col2:
            st.metric("Semantic Similarity", f"{semantic_ratio:.2%}")

        # Show content previews
        st.subheader("Content Previews")
        st.text_area("Content 1 (First 500 chars)", content1[:500])
        st.text_area("Content 2 (First 500 chars)", content2[:500])

    except Exception as e:
        # Report navigation/comparison failures the same way the other
        # tools do instead of surfacing a raw traceback.
        st.error(f"Error comparing content: {str(e)}")

    finally:
        browser.close()
        playwright.stop()


def main():
    """Build the Streamlit page and dispatch to the tool chosen in the sidebar.

    Initialises the session-state slots, runs the Playwright dependency
    check, then renders one of the three tools: "WebTest", "Crawler" or
    "AI Content Comparison".
    """
    st.title("Web Testing and Crawling Suite")
    initialize_session_state()

    # Install dependencies if needed
    with st.spinner("Checking dependencies..."):
        install_playwright_deps()

    # Sidebar for tool selection
    tool = st.sidebar.radio(
        "Select Tool",
        ["WebTest", "Crawler", "AI Content Comparison"]
    )

    if tool == "WebTest":
        _render_webtest()
    elif tool == "Crawler":
        _render_crawler()
    else:  # AI Content Comparison
        _render_content_comparison()

# Entry point: build the UI via main() only when this file is the module
# being executed, not when it is imported.
if __name__ == "__main__":
    main()
    
# Free-form project notes (original request, descriptions of the earlier
# WebTest/Crawler tools, and suggested libraries with example snippets).
# The text is kept as one string so it can be rendered on the page below.
Goals="""
Computer Use
1. Browser based testing app
2. similar to apps I wrote years ago which would operate a browser then run tests against my web apps including being able to compare any image or text content together to search results from one of my ai programs to determine content overlap which is then used to evaluate the results and update my ai model context data to store anything that was found that adds to the original idea.  When I looked at this problem before I found chrome driver for automatic testing, saucelabs which can kind of do it, and then some python testing libraries which could do it.  Can you enlighten me on which python libraries and potenitally dev tools which would help me with this to automate my testing and evaluation of my ai generated content which resides at many different URLs on huggingface as running apps
3. Past apps per wayback from 2004:  
    - https://web.archive.org/web/20040520102150/http://www.evolvable.com/EStore/

WebTest 8.0
WebTest is a stress and load testing browser.
You can use WebTest to identify defects that occur when web sites incurr a large amount of traffic.
To use WebTest, simply visit the pages that you want to test and WebTest remembers your navigation history. You can save the history to a text file that you can open from other machines or other copies of WebTest running on your computer. To perform stress testing, set the time interval you would like WebTest to visit each Url, then set the cycle option to cycle through your site list.

Features Include:
Screen Captures
Graph of Page Load Times
Memory of Visited Pages
Tunable Delay and Maximum Wait Time
Hotkeys for Typical Browser Resolution Sizing
XML/XSL based Usability Reporting

Crawler 1.0

Have you ever wanted to analyze a web site to see what it contains yet didn't want to spend the next few days traversing all the links on the site?

With Crawler, you can enter a web address and Crawler visits every page, extracting a reusable list of links from the pages it reviews.
You can save the HTML source code from each page that Crawler encounters so you can analyze the site further, or automatically surf the site and take snapshots using Evolvable WebTest.

Crawler is an indespensable tool because it provides results in an easy to use format and also allows you to search for terms across different sites and save the list of links that match your search criteria.

Let me help you explore the Python libraries and tools that would be well-suited for automated browser testing and content comparison for AI-generated content.
The most popular and robust options for this type of testing are:

Selenium with Python

pythonCopyfrom selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://huggingface.co/your-app")
content = driver.find_element(By.CLASS_NAME, "content").text

Playwright (more modern alternative to Selenium)

pythonCopyfrom playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto("https://huggingface.co/your-app")
    content = page.inner_text(".content")

For content comparison and similarity analysis:

pythonCopyfrom difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Basic text similarity
def similarity_ratio(text1, text2):
    return SequenceMatcher(None, text1, text2).ratio()

# More advanced semantic similarity
def semantic_similarity(text1, text2):
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform([text1, text2])
    return cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
For image comparison:
pythonCopyfrom PIL import Image
import imagehash
import numpy as np

def compare_images(image1_path, image2_path):
    hash1 = imagehash.average_hash(Image.open(image1_path))
    hash2 = imagehash.average_hash(Image.open(image2_path))
    return hash1 - hash2  # Lower number means more similar
Would you like me to create a more comprehensive example that combines these elements into a complete testing framework? I could show you how to:

Automate browsing multiple HuggingFace URLs
Extract both text and image content
Compare content using different similarity metrics
Store results in a structured format for model context updates

"""

# Render the notes as Markdown. This statement is at module level, outside
# the __main__ guard above, so it also runs whenever the module is imported.
st.markdown(Goals)