from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel
from playwright.async_api import async_playwright
import asyncio
import base64
import logging
from typing import List, Optional
from urllib.parse import urlparse

app = FastAPI(
    title="Website Quality & Compliance Analyzer",
    description="API that analyzes websites for SEO, accessibility, compliance and technical quality",
    version="1.0.0"
)

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class SEOResult(BaseModel):
    title: Optional[str] = None
    meta_description: Optional[str] = None
    h1_tags: List[str] = []
    canonical_url: Optional[str] = None
    # Not yet populated: these would require extra fetches of /robots.txt and /sitemap.xml
    robots_txt_present: bool = False
    sitemap_present: bool = False

class AccessibilityResult(BaseModel):
    missing_alt_tags: int = 0
    images_without_alt: List[str] = []
    aria_roles: List[str] = []        # not yet populated by analyze_page
    contrast_issues: List[str] = []   # not yet populated by analyze_page

class ComplianceResult(BaseModel):
    has_cookie_banner: bool = False
    gdpr_compliant: Optional[bool] = None  # not determinable from a single page load; left unset
    has_privacy_policy: bool = False
    has_terms_of_service: bool = False

class TechnicalResult(BaseModel):
    tech_stack: List[str] = []
    viewport_meta: Optional[str] = None
    doctype: Optional[str] = None
    is_https: bool = False
    has_analytics: bool = False

class BrokenLink(BaseModel):
    url: str
    status: Optional[int] = None
    text: Optional[str] = None

class AnalysisRequest(BaseModel):
    url: str
    screenshot: bool = False
    mobile_test: bool = False
    check_broken_links: bool = False
    depth: int = 1  # Link-check depth; the built-in check only follows one level (see the sketch after analyze_page)

class AnalysisResponse(BaseModel):
    url: str
    seo: SEOResult
    accessibility: AccessibilityResult
    compliance: ComplianceResult
    technical: TechnicalResult
    broken_links: List[BrokenLink] = []
    mobile_friendly: Optional[bool] = None
    screenshot_base64: Optional[str] = None
    load_time: Optional[float] = None
    success: bool
    error: Optional[str] = None
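
# Illustrative POST body for /analyze (values are placeholders):
# {
#     "url": "https://example.com",
#     "screenshot": true,
#     "mobile_test": false,
#     "check_broken_links": true,
#     "depth": 1
# }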

async def analyze_page(page, url: str, options: AnalysisRequest):
    """Run the on-page checks against an already-loaded page and return a
    dict shaped like AnalysisResponse."""
    result = {
        "url": url,
        "seo": {},
        "accessibility": {},
        "compliance": {},
        "technical": {},
        "broken_links": [],
        "success": True
    }
    
    # Basic SEO checks
    title = await page.title()
    meta_description = await page.evaluate('''() => {
        const meta = document.querySelector('meta[name="description"]');
        return meta ? meta.content : null;
    }''')
    
    h1_tags = await page.evaluate('''() => {
        return Array.from(document.querySelectorAll('h1')).map(h => h.textContent.trim());
    }''')
    
    canonical_url = await page.evaluate('''() => {
        const link = document.querySelector('link[rel="canonical"]');
        return link ? link.href : null;
    }''')
    
    result["seo"] = {
        "title": title,
        "meta_description": meta_description,
        "h1_tags": h1_tags,
        "canonical_url": canonical_url
    }
    
    # Accessibility checks
    images_without_alt = await page.evaluate('''() => {
        return Array.from(document.querySelectorAll('img:not([alt])'))
            .map(img => img.src);
    }''')
    
    result["accessibility"] = {
        "missing_alt_tags": len(images_without_alt),
        "images_without_alt": images_without_alt
    }
    
    # Compliance checks. Scanning every element's textContent would match the
    # <body> of any page that merely mentions "privacy", so look for the
    # keywords in ids and classes instead - still a heuristic, just a tighter one.
    has_cookie_banner = await page.evaluate('''() => {
        const keywords = ['cookie', 'gdpr', 'consent'];
        return keywords.some(kw =>
            document.querySelector(`[id*="${kw}" i], [class*="${kw}" i]`) !== null);
    }''')
    
    has_privacy_policy = await page.evaluate('''() => {
        return Array.from(document.querySelectorAll('a[href]'))
            .some(a => /privacy/i.test(a.href) || /privacy/i.test(a.textContent));
    }''')
    
    has_terms_of_service = await page.evaluate('''() => {
        return Array.from(document.querySelectorAll('a[href]'))
            .some(a => /terms/i.test(a.href) || /terms/i.test(a.textContent));
    }''')
    
    result["compliance"] = {
        "has_cookie_banner": has_cookie_banner,
        "has_privacy_policy": has_privacy_policy,
        "has_terms_of_service": has_terms_of_service
    }
    
    # Technical checks
    tech_stack = []
    
    # Check for common JS libraries exposed on the window object
    libraries = await page.evaluate('''() => {
        const libs = [];
        if (window.jQuery) libs.push('jQuery');
        if (window.React) libs.push('React');
        if (window.Vue) libs.push('Vue');
        if (window.angular) libs.push('Angular');
        return libs;
    }''')
    
    tech_stack.extend(libraries)
    
    # Check for Google Analytics / Tag Manager script tags
    has_analytics = await page.evaluate('''() => {
        return !!document.querySelector('script[src*="google-analytics.com"], script[src*="googletagmanager.com"]');
    }''')
    
    viewport_meta = await page.evaluate('''() => {
        const meta = document.querySelector('meta[name="viewport"]');
        return meta ? meta.content : null;
    }''')
    
    doctype = await page.evaluate('() => document.doctype ? document.doctype.name : null')
    
    is_https = url.startswith('https://')
    
    result["technical"] = {
        "tech_stack": tech_stack,
        "viewport_meta": viewport_meta,
        "doctype": doctype,
        "is_https": is_https,
        "has_analytics": has_analytics
    }
    
    # Broken links check (if requested)
    if options.check_broken_links and options.depth > 0:
        links = await page.evaluate('''() => {
            return Array.from(document.querySelectorAll('a[href]')).map(a => ({
                href: a.href,
                text: a.textContent.trim()
            }));
        }''')
        
        # Filter out external links and non-http links
        domain = urlparse(url).netloc
        internal_links = [
            link for link in links 
            if link['href'].startswith('http') and domain in link['href']
        ][:10]  # Limit to 10 links for demo purposes
        
        broken_links = []
        for link in internal_links:
            try:
                response = await page.goto(link['href'], wait_until="domcontentloaded")
                status = response.status if response else None
                if status and status >= 400:
                    broken_links.append({
                        "url": link['href'],
                        "status": status,
                        "text": link['text']
                    })
            except Exception as e:
                # Navigation failures (DNS errors, timeouts) also count as broken
                logger.warning(f"Could not reach {link['href']}: {e}")
                broken_links.append({
                    "url": link['href'],
                    "status": None,
                    "text": link['text']
                })
        
        # The loop above navigates away from the page under analysis, so return
        # to the original URL before any screenshot is taken later on.
        await page.goto(url, wait_until="domcontentloaded")
        
        result["broken_links"] = broken_links
    
    return result
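
# A minimal sketch (hypothetical, not wired into analyze_page) of how depth > 1
# could extend the broken-link check: follow internal links depth-first, one
# level per unit of depth, tracking visited URLs to avoid cycles. Call it as,
# e.g., await check_links_recursive(page, options.depth, set()). The 10-link
# cap per page mirrors the demo limit above.
async def check_links_recursive(page, depth: int, visited: set) -> List[dict]:
    if depth <= 0:
        return []
    domain = urlparse(page.url).netloc
    links = await page.evaluate('''() => {
        return Array.from(document.querySelectorAll('a[href]')).map(a => ({
            href: a.href,
            text: a.textContent.trim()
        }));
    }''')
    internal = [
        link for link in links
        if link['href'].startswith('http') and domain in link['href']
    ][:10]
    broken = []
    for link in internal:
        if link['href'] in visited:
            continue
        visited.add(link['href'])
        try:
            response = await page.goto(link['href'], wait_until="domcontentloaded")
            status = response.status if response else None
        except Exception:
            status = None
        if status is None or status >= 400:
            broken.append({"url": link['href'], "status": status, "text": link['text']})
        elif depth > 1:
            # The page is now on the link we just loaded, so recurse from here
            broken.extend(await check_links_recursive(page, depth - 1, visited))
    return broken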

@app.post("/analyze", response_model=AnalysisResponse)
async def analyze_website(request: AnalysisRequest):
    """Analyze a website for quality and compliance metrics"""
    async with async_playwright() as p:
        browser = None
        try:
            browser = await p.chromium.launch()
            context = await browser.new_context()
            page = await context.new_page()
            
            # Time only the initial navigation, so load_time reflects page load
            # rather than the whole analysis
            start_time = asyncio.get_event_loop().time()
            response = await page.goto(request.url, wait_until="domcontentloaded")
            load_time = asyncio.get_event_loop().time() - start_time
            
            if not response or response.status >= 400:
                raise HTTPException(
                    status_code=400,
                    detail=f"Failed to load page. Status: {response.status if response else 'unknown'}"
                )
            
            # Mobile test if requested: resize to a phone-sized viewport before analyzing
            if request.mobile_test:
                await page.set_viewport_size({'width': 375, 'height': 667})
                result = await analyze_page(page, request.url, request)
                # Heuristic: treat a viewport meta tag as the minimal signal of
                # mobile readiness; thorough testing would inspect layout too
                result["mobile_friendly"] = await page.evaluate(
                    '''() => !!document.querySelector('meta[name="viewport"]')'''
                )
            else:
                result = await analyze_page(page, request.url, request)
            
            # Screenshot if requested
            if request.screenshot:
                screenshot = await page.screenshot(full_page=True)
                result["screenshot_base64"] = base64.b64encode(screenshot).decode('utf-8')
            
            result["load_time"] = load_time
            return result
            
        except HTTPException:
            # Let deliberate 4xx responses through instead of masking them as 500s
            raise
        except Exception as e:
            logger.error(f"Error analyzing website: {str(e)}")
            raise HTTPException(status_code=500, detail=str(e))
        finally:
            # Close the browser whether the analysis succeeded or failed
            if browser:
                await browser.close()

@app.get("/analyze", response_model=AnalysisResponse)
async def analyze_website_get(
    url: str = Query(..., description="URL to analyze"),
    screenshot: bool = Query(False, description="Include screenshot"),
    mobile_test: bool = Query(False, description="Test mobile responsiveness"),
    check_broken_links: bool = Query(False, description="Check for broken links"),
    depth: int = Query(1, description="Depth for broken links check")
):
    """GET endpoint for website analysis"""
    request = AnalysisRequest(
        url=url,
        screenshot=screenshot,
        mobile_test=mobile_test,
        check_broken_links=check_broken_links,
        depth=depth
    )
    return await analyze_website(request)

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
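
# Example client call (a sketch: assumes the server is running locally on
# port 8000, that httpx is installed, and example.com is a placeholder):
#
#   import httpx
#   resp = httpx.post(
#       "http://localhost:8000/analyze",
#       json={"url": "https://example.com", "check_broken_links": True},
#       timeout=120,
#   )
#   print(resp.json()["seo"])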