awacke1 commited on
Commit
3386078
·
verified ·
1 Parent(s): 11dd3f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +150 -121
app.py CHANGED
@@ -1,21 +1,30 @@
1
  import streamlit as st
2
  import requests
3
  from bs4 import BeautifulSoup
4
- from selenium import webdriver
5
- from selenium.webdriver.chrome.options import Options
6
- from selenium.webdriver.common.by import By
7
  from PIL import Image
8
- import imagehash
9
- from difflib import SequenceMatcher
10
- from sklearn.feature_extraction.text import TfidfVectorizer
11
- from sklearn.metrics.pairwise import cosine_similarity
12
- import time
13
  import io
14
  import base64
15
  from urllib.parse import urljoin, urlparse
16
  import pandas as pd
17
  import plotly.graph_objects as go
18
  import numpy as np
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  def initialize_session_state():
21
  if 'visited_urls' not in st.session_state:
@@ -27,16 +36,20 @@ def initialize_session_state():
27
  if 'crawl_results' not in st.session_state:
28
  st.session_state.crawl_results = []
29
 
30
def setup_chrome_driver():
    """Create a headless Chrome WebDriver suitable for containerized environments."""
    opts = Options()
    # Headless plus relaxed sandbox/shared-memory flags so Chrome can start
    # inside Docker/CI hosts where /dev/shm is small and no display exists.
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)
 
 
 
36
 
37
def capture_screenshot(driver):
    """Grab the driver's current viewport and return it as a PIL Image."""
    png_bytes = driver.get_screenshot_as_png()
    buffer = io.BytesIO(png_bytes)
    return Image.open(buffer)
 
40
 
41
  def calculate_similarity(text1, text2):
42
  # Basic similarity
@@ -52,52 +65,64 @@ def calculate_similarity(text1, text2):
52
 
53
  return basic_ratio, semantic_ratio
54
 
55
def crawl_website(url, max_pages=10, search_term=None):
    """Breadth-limited crawl of a single site starting at *url*.

    Visits up to *max_pages* same-domain pages. If *search_term* is given,
    only pages whose text contains it (case-insensitive) are recorded;
    otherwise every visited page is recorded.

    Returns a list of dicts with 'url', 'title', 'content_preview', and
    'matches_search' keys. Errors on individual pages are reported via
    st.error and do not abort the crawl.
    """
    visited = set()
    to_visit = {url}
    results = []

    while to_visit and len(visited) < max_pages:
        current_url = to_visit.pop()
        if current_url in visited:
            continue

        try:
            # Fix: a timeout prevents one unresponsive server from hanging
            # the entire crawl (the original call could block indefinitely).
            response = requests.get(current_url, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            visited.add(current_url)

            # Extract visible text content for matching and previews.
            text_content = soup.get_text()

            # If a search term was provided, check for matches.
            match_found = search_term.lower() in text_content.lower() if search_term else True

            if match_found:
                results.append({
                    'url': current_url,
                    'title': soup.title.string if soup.title else 'No title',
                    'content_preview': text_content[:200],
                    'matches_search': match_found
                })

            # Queue new same-domain links; fix: skip URLs already visited so
            # the frontier does not refill with pages we have processed.
            for link in soup.find_all('a'):
                href = link.get('href')
                if href:
                    absolute_url = urljoin(current_url, href)
                    if (urlparse(absolute_url).netloc == urlparse(url).netloc
                            and absolute_url not in visited):
                        to_visit.add(absolute_url)

        except Exception as e:
            st.error(f"Error crawling {current_url}: {str(e)}")

    return results
96
 
97
  def main():
98
  st.title("Web Testing and Crawling Suite")
99
  initialize_session_state()
100
 
 
 
 
 
101
  # Sidebar for tool selection
102
  tool = st.sidebar.radio(
103
  "Select Tool",
@@ -111,41 +136,46 @@ def main():
111
  max_cycles = st.number_input("Number of test cycles", 1, 100, 1)
112
 
113
  if st.button("Start Testing"):
114
- driver = setup_chrome_driver()
115
-
116
- for cycle in range(max_cycles):
117
- start_time = time.time()
118
-
119
  try:
120
- driver.get(url)
121
- load_time = time.time() - start_time
122
- st.session_state.load_times.append(load_time)
123
-
124
- # Capture screenshot
125
- screenshot = capture_screenshot(driver)
126
- st.session_state.screenshots.append(screenshot)
127
-
128
- # Show results
129
- st.success(f"Cycle {cycle + 1} completed - Load time: {load_time:.2f}s")
130
- st.image(screenshot, caption=f"Screenshot - Cycle {cycle + 1}")
131
-
132
- # Plot load times
133
- fig = go.Figure(data=go.Scatter(
134
- x=list(range(1, len(st.session_state.load_times) + 1)),
135
- y=st.session_state.load_times,
136
- mode='lines+markers'
137
- ))
138
- fig.update_layout(title="Page Load Times",
139
- xaxis_title="Cycle",
140
- yaxis_title="Load Time (s)")
141
- st.plotly_chart(fig)
142
-
143
- time.sleep(interval)
144
-
145
- except Exception as e:
146
- st.error(f"Error in cycle {cycle + 1}: {str(e)}")
147
 
148
- driver.quit()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  elif tool == "Crawler":
151
  st.header("Web Crawler")
@@ -154,7 +184,7 @@ def main():
154
  search_term = st.text_input("Search term (optional)")
155
 
156
  if st.button("Start Crawling"):
157
- results = crawl_website(base_url, max_pages, search_term)
158
  st.session_state.crawl_results = results
159
 
160
  # Display results
@@ -174,46 +204,45 @@ def main():
174
  url2 = st.text_input("Enter second URL (Comparison content)")
175
 
176
  if st.button("Compare Content"):
177
- driver = setup_chrome_driver()
178
-
179
- try:
180
- # Get content from first URL
181
- driver.get(url1)
182
- content1 = driver.find_element(By.TAG_NAME, "body").text
183
-
184
- # Get content from second URL
185
- driver.get(url2)
186
- content2 = driver.find_element(By.TAG_NAME, "body").text
187
-
188
- # Calculate similarities
189
- basic_ratio, semantic_ratio = calculate_similarity(content1, content2)
190
-
191
- # Display results
192
- st.subheader("Similarity Results")
193
- col1, col2 = st.columns(2)
194
-
195
- with col1:
196
- st.metric("Basic Similarity", f"{basic_ratio:.2%}")
197
-
198
- with col2:
199
- st.metric("Semantic Similarity", f"{semantic_ratio:.2%}")
200
-
201
- # Show content previews
202
- st.subheader("Content Previews")
203
- st.text_area("Content 1 (First 500 chars)", content1[:500])
204
- st.text_area("Content 2 (First 500 chars)", content2[:500])
205
-
206
- except Exception as e:
207
- st.error(f"Error comparing content: {str(e)}")
208
-
209
- finally:
210
- driver.quit()
211
 
212
  if __name__ == "__main__":
213
  main()
214
-
215
 
216
- AppGoals="""
217
  Computer Use
218
  1. Browser based testing app
219
2. similar to apps I wrote years ago which would operate a browser then run tests against my web apps including being able to compare any image or text content together to search results from one of my ai programs to determine content overlap which is then used to evaluate the results and update my ai model context data to store anything that was found that adds to the original idea. When I looked at this problem before I found chrome driver for automatic testing, saucelabs which can kind of do it, and then some python testing libraries which could do it. Can you enlighten me on which python libraries and potentially dev tools would help me with this to automate my testing and evaluation of my ai generated content which resides at many different URLs on huggingface as running apps
 
1
  import streamlit as st
2
  import requests
3
  from bs4 import BeautifulSoup
 
 
 
4
  from PIL import Image
 
 
 
 
 
5
  import io
6
  import base64
7
  from urllib.parse import urljoin, urlparse
8
  import pandas as pd
9
  import plotly.graph_objects as go
10
  import numpy as np
11
+ from difflib import SequenceMatcher
12
+ from sklearn.feature_extraction.text import TfidfVectorizer
13
+ from sklearn.metrics.pairwise import cosine_similarity
14
+ import time
15
+ import asyncio
16
+ from playwright.sync_api import sync_playwright
17
+ import sys
18
+ import subprocess
19
+
20
def install_playwright_deps():
    """Install Playwright's browser binaries, at most once per session.

    Failures are surfaced in the Streamlit UI (st.error/st.info) instead of
    raising, so the rest of the app can still render.
    """
    # Streamlit reruns the whole script on every interaction; without this
    # guard the (slow) install subprocess would run on each rerun.
    if st.session_state.get('playwright_installed'):
        return
    try:
        # Verify the package is importable before shelling out.
        from playwright.sync_api import sync_playwright  # noqa: F401
        # Fix: invoke via `python -m playwright` so this works even when the
        # `playwright` console script is not on PATH (common on hosted apps).
        subprocess.run([sys.executable, '-m', 'playwright', 'install'], check=True)
        st.session_state.playwright_installed = True
    except Exception as e:
        st.error(f"Error installing Playwright dependencies: {str(e)}")
        st.info("Try running 'pip install playwright' and 'playwright install' manually")
28
 
29
  def initialize_session_state():
30
  if 'visited_urls' not in st.session_state:
 
36
  if 'crawl_results' not in st.session_state:
37
  st.session_state.crawl_results = []
38
 
39
def setup_browser():
    """Start Playwright and launch a headless Chromium instance.

    Returns a ``(playwright, browser)`` pair, or ``(None, None)`` when
    startup fails (the error is shown in the Streamlit UI).
    """
    try:
        pw = sync_playwright().start()
        chromium = pw.chromium.launch(headless=True)
    except Exception as e:
        st.error(f"Error setting up browser: {str(e)}")
        return None, None
    return pw, chromium
48
 
49
def capture_screenshot(page):
    """Render the Playwright page's current viewport as a PIL Image."""
    buffer = io.BytesIO(page.screenshot())
    return Image.open(buffer)
53
 
54
  def calculate_similarity(text1, text2):
55
  # Basic similarity
 
65
 
66
  return basic_ratio, semantic_ratio
67
 
68
def _crawl_sync(url, max_pages, search_term):
    """Synchronous Playwright crawl; must run OFF the asyncio event loop
    (see crawl_website below for why)."""
    visited = set()
    to_visit = {url}
    results = []

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()

            while to_visit and len(visited) < max_pages:
                current_url = to_visit.pop()
                if current_url in visited:
                    continue

                try:
                    page.goto(current_url, wait_until="networkidle")
                    visited.add(current_url)

                    # Fix: match against the rendered text, not page.content()
                    # (raw HTML), so search terms are not "found" inside markup
                    # or scripts — this mirrors the pre-Playwright behavior of
                    # soup.get_text().
                    text_content = page.inner_text("body")

                    # If a search term was provided, check for matches.
                    match_found = (search_term.lower() in text_content.lower()
                                   if search_term else True)

                    if match_found:
                        results.append({
                            'url': current_url,
                            'title': page.title(),
                            'content_preview': text_content[:200],
                            'matches_search': match_found
                        })

                    # Queue new same-domain links; skip already-visited URLs
                    # so the frontier does not refill with processed pages.
                    links = page.eval_on_selector_all(
                        'a[href]', 'elements => elements.map(el => el.href)')
                    for href in links:
                        absolute_url = urljoin(current_url, href)
                        if (urlparse(absolute_url).netloc == urlparse(url).netloc
                                and absolute_url not in visited):
                            to_visit.add(absolute_url)

                except Exception as e:
                    st.error(f"Error crawling {current_url}: {str(e)}")

            browser.close()

    except Exception as e:
        st.error(f"Error in crawl process: {str(e)}")

    return results

async def crawl_website(url, max_pages=10, search_term=None):
    """Crawl up to *max_pages* same-domain pages starting at *url*.

    Bug fix: this coroutine is driven by asyncio.run(), but Playwright's
    sync API raises ("Sync API inside asyncio loop") when called from a
    running event loop. The actual crawl is therefore executed in a worker
    thread via asyncio.to_thread, keeping the awaitable interface callers
    expect while making the sync API legal.

    Returns the list of result dicts produced by the crawl.
    """
    return await asyncio.to_thread(_crawl_sync, url, max_pages, search_term)
117
 
118
  def main():
119
  st.title("Web Testing and Crawling Suite")
120
  initialize_session_state()
121
 
122
+ # Install dependencies if needed
123
+ with st.spinner("Checking dependencies..."):
124
+ install_playwright_deps()
125
+
126
  # Sidebar for tool selection
127
  tool = st.sidebar.radio(
128
  "Select Tool",
 
136
  max_cycles = st.number_input("Number of test cycles", 1, 100, 1)
137
 
138
  if st.button("Start Testing"):
139
+ playwright, browser = setup_browser()
140
+ if playwright and browser:
 
 
 
141
  try:
142
+ page = browser.new_page()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
+ for cycle in range(max_cycles):
145
+ start_time = time.time()
146
+
147
+ try:
148
+ page.goto(url, wait_until="networkidle")
149
+ load_time = time.time() - start_time
150
+ st.session_state.load_times.append(load_time)
151
+
152
+ # Capture screenshot
153
+ screenshot = capture_screenshot(page)
154
+ st.session_state.screenshots.append(screenshot)
155
+
156
+ # Show results
157
+ st.success(f"Cycle {cycle + 1} completed - Load time: {load_time:.2f}s")
158
+ st.image(screenshot, caption=f"Screenshot - Cycle {cycle + 1}")
159
+
160
+ # Plot load times
161
+ fig = go.Figure(data=go.Scatter(
162
+ x=list(range(1, len(st.session_state.load_times) + 1)),
163
+ y=st.session_state.load_times,
164
+ mode='lines+markers'
165
+ ))
166
+ fig.update_layout(title="Page Load Times",
167
+ xaxis_title="Cycle",
168
+ yaxis_title="Load Time (s)")
169
+ st.plotly_chart(fig)
170
+
171
+ time.sleep(interval)
172
+
173
+ except Exception as e:
174
+ st.error(f"Error in cycle {cycle + 1}: {str(e)}")
175
+
176
+ finally:
177
+ browser.close()
178
+ playwright.stop()
179
 
180
  elif tool == "Crawler":
181
  st.header("Web Crawler")
 
184
  search_term = st.text_input("Search term (optional)")
185
 
186
  if st.button("Start Crawling"):
187
+ results = asyncio.run(crawl_website(base_url, max_pages, search_term))
188
  st.session_state.crawl_results = results
189
 
190
  # Display results
 
204
  url2 = st.text_input("Enter second URL (Comparison content)")
205
 
206
  if st.button("Compare Content"):
207
+ playwright, browser = setup_browser()
208
+ if playwright and browser:
209
+ try:
210
+ page = browser.new_page()
211
+
212
+ # Get content from first URL
213
+ page.goto(url1, wait_until="networkidle")
214
+ content1 = page.content()
215
+
216
+ # Get content from second URL
217
+ page.goto(url2, wait_until="networkidle")
218
+ content2 = page.content()
219
+
220
+ # Calculate similarities
221
+ basic_ratio, semantic_ratio = calculate_similarity(content1, content2)
222
+
223
+ # Display results
224
+ st.subheader("Similarity Results")
225
+ col1, col2 = st.columns(2)
226
+
227
+ with col1:
228
+ st.metric("Basic Similarity", f"{basic_ratio:.2%}")
229
+
230
+ with col2:
231
+ st.metric("Semantic Similarity", f"{semantic_ratio:.2%}")
232
+
233
+ # Show content previews
234
+ st.subheader("Content Previews")
235
+ st.text_area("Content 1 (First 500 chars)", content1[:500])
236
+ st.text_area("Content 2 (First 500 chars)", content2[:500])
237
+
238
+ finally:
239
+ browser.close()
240
+ playwright.stop()
241
 
242
  if __name__ == "__main__":
243
  main()
 
244
 
245
+ Goals="""
246
  Computer Use
247
  1. Browser based testing app
248
2. similar to apps I wrote years ago which would operate a browser then run tests against my web apps including being able to compare any image or text content together to search results from one of my ai programs to determine content overlap which is then used to evaluate the results and update my ai model context data to store anything that was found that adds to the original idea. When I looked at this problem before I found chrome driver for automatic testing, saucelabs which can kind of do it, and then some python testing libraries which could do it. Can you enlighten me on which python libraries and potentially dev tools would help me with this to automate my testing and evaluation of my ai generated content which resides at many different URLs on huggingface as running apps