apexherbert200 committed on
Commit e736965 · 1 Parent(s): b55a15f

Building new logic

Files changed (2)
  1. Dockerfile +1 -1
  2. test1.py +228 -136
Dockerfile CHANGED
@@ -53,4 +53,4 @@ RUN python -m playwright install chromium
  EXPOSE 7860

  # Run the FastAPI application
- CMD ["python", "-m", "uvicorn", "scrape:app", "--host", "0.0.0.0", "--port", "7860"]
+ CMD ["python", "-m", "uvicorn", "test1:app", "--host", "0.0.0.0", "--port", "7860"]
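The only Dockerfile change is the uvicorn entrypoint, which now serves test1:app instead of scrape:app, so the container boots the analyzer API defined in test1.py on port 7860. A minimal smoke test of the renamed entrypoint, assuming the image has been built and is running locally with the port published (e.g. docker run -p 7860:7860 <image>); the module name and port come from the Dockerfile above, everything else here is illustrative:

# check_entrypoint.py - hypothetical smoke test against a locally running container (stdlib only)
import json
import urllib.request

# FastAPI serves its OpenAPI schema at /openapi.json by default;
# the title should match the FastAPI(...) call added in test1.py.
with urllib.request.urlopen("http://localhost:7860/openapi.json", timeout=10) as resp:
    spec = json.load(resp)

print(spec["info"]["title"])  # expected: "Website Quality & Compliance Analyzer"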
test1.py CHANGED
@@ -5,161 +5,253 @@ import asyncio
  import base64
  import logging
  from typing import List, Optional
- from urllib.parse import urlparse, parse_qs

- # Set up logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

- app = FastAPI(title="Query-Based Web Scraper", description="Scrape websites based on search queries")

- # ... (Keep all your Pydantic models unchanged) ...

- @app.get("/")
- async def root():
-     return {
-         "message": "🚀 Query-Based Web Scraper API",
-         "tagline": "Search and scrape websites based on queries",
-         "endpoints": {
-             "/scrape": "Search Google for the query and scrape the top result",
-             "/docs": "API documentation"
-         },
-         "example": "/scrape?query=plumbers+near+me&lead_generation=true&screenshot=true",
-         "note": "Now accepts search queries instead of direct URLs"
      }

- async def get_top_search_result(query: str):
-     """Perform Google search and return top result URL with CAPTCHA handling"""
-     user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
      async with async_playwright() as p:
-         # Use a proxy to avoid CAPTCHAs
-         proxy_server = "us.proxyrack.net:10000"
-         browser = await p.chromium.launch(
-             headless=True,
-             proxy={
-                 "server": f"http://{proxy_server}",
-                 "username": "your-proxy-username", # Replace with actual credentials
-                 "password": "your-proxy-password"
-             },
-             args=[
-                 '--no-sandbox',
-                 '--disable-setuid-sandbox',
-                 '--disable-dev-shm-usage',
-                 '--disable-accelerated-2d-canvas',
-                 '--no-first-run',
-                 '--no-zygote',
-                 '--disable-gpu'
-             ]
-         )
-         context = await browser.new_context(
-             user_agent=user_agent,
-             locale='en-US',
-             viewport={'width': 1920, 'height': 1080},
-             # Bypass automation detection
-             java_script_enabled=True,
-             bypass_csp=True
-         )
-         page = await context.new_page()
-
          try:
-             logger.info(f"Searching Google for: {query}")
-             await page.goto("https://www.google.com", timeout=60000)

-             # Handle consent form if it appears
-             try:
-                 consent_button = await page.wait_for_selector('button:has-text("Accept all"), button:has-text("I agree")', timeout=5000)
-                 if consent_button:
-                     await consent_button.click()
-                     logger.info("Accepted Google consent form")
-                     await asyncio.sleep(1) # Small delay for consent to apply
-             except:
-                 pass # Consent form didn't appear

-             # Perform search
-             search_box = await page.wait_for_selector('textarea[name="q"]', timeout=10000)
-             await search_box.fill(query)
-             await page.keyboard.press("Enter")
-
-             # Wait for search results - use more reliable method
-             try:
-                 # Check if CAPTCHA appeared
-                 captcha = await page.query_selector('form#captcha-form, div#recaptcha')
-                 if captcha:
-                     logger.error("CAPTCHA encountered during search")
-                     raise Exception("Google CAPTCHA encountered. Cannot proceed with search.")
-
-                 # Wait for search results to appear
-                 await page.wait_for_selector('.g, .tF2Cxc', timeout=30000)
-             except:
-                 # Try alternative search result container
-                 try:
-                     await page.wait_for_selector('#search', timeout=10000)
-                 except:
-                     logger.error("Search results not found")
-                     raise Exception("Search results not found")

-             # Extract top results
-             results = await page.query_selector_all('.g, .tF2Cxc')
-             if not results:
-                 results = await page.query_selector_all('div[data-snf]')

-             if not results:
-                 raise Exception("No search results found")

-             urls = []
-             for result in results[:3]: # Check top 3 results
-                 try:
-                     link = await result.query_selector('a')
-                     if not link:
-                         continue
-
-                     # Extract both data-href and href attributes
-                     data_href = await link.get_attribute('data-href')
-                     href = await link.get_attribute('href')
-                     target_url = data_href or href
-
-                     if target_url and target_url.startswith('/url?q='):
-                         target_url = f"https://www.google.com{target_url}"
-
-                     if target_url and target_url.startswith('https://www.google.com/url?'):
-                         parsed = urlparse(target_url)
-                         qs = parse_qs(parsed.query)
-                         target_url = qs.get('q', [target_url])[0]
-
-                     if target_url and target_url.startswith('http'):
-                         urls.append(target_url)
-                         logger.info(f"Found search result: {target_url}")
-                 except Exception as e:
-                     logger.warning(f"Error processing result: {str(e)}")
-
-             if not urls:
-                 raise Exception("No valid URLs found in search results")

              await browser.close()
-             return urls[0] # Return top result
-
          except Exception as e:
-             logger.error(f"Search failed: {str(e)}")
-             await page.screenshot(path="search_error.png")
              await browser.close()
-             raise

- @app.get("/scrape")
- async def scrape_page(
-     query: str = Query(..., description="Search query to find a website"),
-     lead_generation: bool = Query(True, description="Extract lead generation data"),
-     screenshot: bool = Query(True, description="Take a full page screenshot"),
-     get_links: bool = Query(True, description="Extract all links from the page"),
-     get_body: bool = Query(False, description="Extract body tag content")
  ):
-     logger.info(f"Starting scrape for query: {query}")
-
-     try:
-         # Get top search result URL
-         target_url = await get_top_search_result(query)
-         logger.info(f"Scraping top result: {target_url}")
-     except Exception as e:
-         logger.error(f"Search error: {str(e)}")
-         raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}")

- # ... (keep the rest of the scraping function unchanged) ...

  import base64
  import logging
  from typing import List, Optional
+ from urllib.parse import urlparse

+ app = FastAPI(
+     title="Website Quality & Compliance Analyzer",
+     description="API that analyzes websites for SEO, accessibility, compliance and technical quality",
+     version="1.0.0"
+ )
+
+ # Configure logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ class SEOResult(BaseModel):
+     title: Optional[str] = None
+     meta_description: Optional[str] = None
+     h1_tags: List[str] = []
+     canonical_url: Optional[str] = None
+     robots_txt_present: bool = False
+     sitemap_present: bool = False
+
+ class AccessibilityResult(BaseModel):
+     missing_alt_tags: int = 0
+     images_without_alt: List[str] = []
+     aria_roles: List[str] = []
+     contrast_issues: List[str] = []
+
+ class ComplianceResult(BaseModel):
+     has_cookie_banner: bool = False
+     gdpr_compliant: Optional[bool] = None
+     has_privacy_policy: bool = False
+     has_terms_of_service: bool = False
+
+ class TechnicalResult(BaseModel):
+     tech_stack: List[str] = []
+     viewport_meta: Optional[str] = None
+     doctype: Optional[str] = None
+     is_https: bool = False
+     has_analytics: bool = False
+
+ class BrokenLink(BaseModel):
+     url: str
+     status: Optional[int] = None
+     text: Optional[str] = None

+ class AnalysisRequest(BaseModel):
+     url: str
+     screenshot: bool = False
+     mobile_test: bool = False
+     check_broken_links: bool = False
+     depth: int = 1 # How many levels deep to check broken links

+ class AnalysisResponse(BaseModel):
+     url: str
+     seo: SEOResult
+     accessibility: AccessibilityResult
+     compliance: ComplianceResult
+     technical: TechnicalResult
+     broken_links: List[BrokenLink] = []
+     mobile_friendly: Optional[bool] = None
+     screenshot_base64: Optional[str] = None
+     load_time: Optional[float] = None
+     success: bool
+     error: Optional[str] = None
+
+ async def analyze_page(page, url: str, options: AnalysisRequest):
+     result = {
+         "url": url,
+         "seo": {},
+         "accessibility": {},
+         "compliance": {},
+         "technical": {},
+         "broken_links": [],
+         "success": True
+     }
+
+     # Basic SEO checks
+     title = await page.title()
+     meta_description = await page.evaluate('''() => {
+         const meta = document.querySelector('meta[name="description"]');
+         return meta ? meta.content : null;
+     }''')
+
+     h1_tags = await page.evaluate('''() => {
+         return Array.from(document.querySelectorAll('h1')).map(h => h.textContent.trim());
+     }''')
+
+     result["seo"] = {
+         "title": title,
+         "meta_description": meta_description,
+         "h1_tags": h1_tags
      }
+
+     # Accessibility checks
+     images_without_alt = await page.evaluate('''() => {
+         return Array.from(document.querySelectorAll('img:not([alt])'))
+             .map(img => img.src);
+     }''')
+
+     result["accessibility"] = {
+         "missing_alt_tags": len(images_without_alt),
+         "images_without_alt": images_without_alt
+     }
+
+     # Compliance checks
+     has_cookie_banner = await page.evaluate('''() => {
+         const keywords = ['cookie', 'gdpr', 'privacy', 'consent'];
+         const elements = document.querySelectorAll('*');
+         for (let el of elements) {
+             const text = el.textContent.toLowerCase();
+             if (keywords.some(kw => text.includes(kw))) {
+                 return true;
+             }
+         }
+         return false;
+     }''')
+
+     result["compliance"] = {
+         "has_cookie_banner": has_cookie_banner
+     }
+
+     # Technical checks
+     tech_stack = []
+
+     # Check for common JS libraries
+     libraries = await page.evaluate('''() => {
+         const libs = [];
+         if (window.jQuery) libs.push('jQuery');
+         if (window.React) libs.push('React');
+         if (window.Vue) libs.push('Vue');
+         if (window.angular) libs.push('Angular');
+         return libs;
+     }''')
+
+     tech_stack.extend(libraries)
+
+     # Check for analytics
+     has_analytics = await page.evaluate('''() => {
+         return !!document.querySelector('script[src*="google-analytics.com"], script[src*="googletagmanager.com"]');
+     }''')
+
+     is_https = url.startswith('https://')
+
+     result["technical"] = {
+         "tech_stack": tech_stack,
+         "is_https": is_https,
+         "has_analytics": has_analytics
+     }
+
+     # Broken links check (if requested)
+     if options.check_broken_links and options.depth > 0:
+         links = await page.evaluate('''() => {
+             return Array.from(document.querySelectorAll('a[href]')).map(a => ({
+                 href: a.href,
+                 text: a.textContent.trim()
+             }));
+         }''')
+
+         # Filter out external links and non-http links
+         domain = urlparse(url).netloc
+         internal_links = [
+             link for link in links
+             if link['href'].startswith('http') and domain in link['href']
+         ][:10] # Limit to 10 links for demo purposes
+
+         broken_links = []
+         for link in internal_links:
+             try:
+                 response = await page.goto(link['href'], wait_until="domcontentloaded")
+                 status = response.status if response else None
+                 if status and status >= 400:
+                     broken_links.append({
+                         "url": link['href'],
+                         "status": status,
+                         "text": link['text']
+                     })
+             except Exception as e:
+                 broken_links.append({
+                     "url": link['href'],
+                     "status": None,
+                     "text": link['text']
+                 })
+
+         result["broken_links"] = broken_links
+
+     return result

+ @app.post("/analyze", response_model=AnalysisResponse)
+ async def analyze_website(request: AnalysisRequest):
+     """Analyze a website for quality and compliance metrics"""
      async with async_playwright() as p:
          try:
+             browser = await p.chromium.launch()
+             context = await browser.new_context()
+             page = await context.new_page()

+             # Start timing
+             start_time = asyncio.get_event_loop().time()

+             # Navigate to the page
+             response = await page.goto(request.url, wait_until="domcontentloaded")
+             if not response or response.status >= 400:
+                 raise HTTPException(status_code=400, detail=f"Failed to load page. Status: {response.status if response else 'unknown'}")

+             # Mobile test if requested
+             if request.mobile_test:
+                 mobile_viewport = {'width': 375, 'height': 667}
+                 await page.set_viewport_size(mobile_viewport)
+                 result = await analyze_page(page, request.url, request)
+                 result["mobile_friendly"] = True # Basic check - would need more sophisticated testing
+             else:
+                 result = await analyze_page(page, request.url, request)

+             # Screenshot if requested
+             if request.screenshot:
+                 screenshot = await page.screenshot(full_page=True)
+                 result["screenshot_base64"] = base64.b64encode(screenshot).decode('utf-8')

+             # Calculate load time
+             end_time = asyncio.get_event_loop().time()
+             result["load_time"] = end_time - start_time

              await browser.close()
+             return result
+
          except Exception as e:
+             logger.error(f"Error analyzing website: {str(e)}")
              await browser.close()
+             raise HTTPException(status_code=500, detail=str(e))

+ @app.get("/analyze", response_model=AnalysisResponse)
+ async def analyze_website_get(
+     url: str = Query(..., description="URL to analyze"),
+     screenshot: bool = Query(False, description="Include screenshot"),
+     mobile_test: bool = Query(False, description="Test mobile responsiveness"),
+     check_broken_links: bool = Query(False, description="Check for broken links"),
+     depth: int = Query(1, description="Depth for broken links check")
  ):
+     """GET endpoint for website analysis"""
+     request = AnalysisRequest(
+         url=url,
+         screenshot=screenshot,
+         mobile_test=mobile_test,
+         check_broken_links=check_broken_links,
+         depth=depth
+     )
+     return await analyze_website(request)

+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
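With the new module in place, the API can be exercised through either endpoint: POST /analyze with a JSON body matching AnalysisRequest, or GET /analyze with query parameters. A minimal client sketch for the GET variant, assuming the server was started via the __main__ block above (port 8000) and using only the standard library; the target URL and parameter values are illustrative:

# query_analyzer.py - hypothetical client for the GET /analyze endpoint
import json
import urllib.parse
import urllib.request

params = urllib.parse.urlencode({
    "url": "https://example.com",   # site to analyze (example value)
    "screenshot": "false",
    "mobile_test": "false",
    "check_broken_links": "true",
    "depth": 1,
})

with urllib.request.urlopen(f"http://localhost:8000/analyze?{params}", timeout=60) as resp:
    report = json.load(resp)

# Fields mirror the AnalysisResponse model defined above.
print(report["seo"]["title"])
print(report["technical"]["tech_stack"])
print(len(report["broken_links"]))

When the service runs from the container built with the Dockerfile above, the same request goes to port 7860 instead of 8000.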