"""FastAPI service that analyzes websites for SEO, accessibility, compliance
and technical quality using a Playwright-driven headless Chromium browser."""

import asyncio
import base64
import logging
from typing import List, Optional
from urllib.parse import urlparse

from fastapi import FastAPI, HTTPException, Query
from playwright.async_api import async_playwright
from pydantic import BaseModel

app = FastAPI(
    title="Website Quality & Compliance Analyzer",
    description="API that analyzes websites for SEO, accessibility, compliance and technical quality",
    version="1.0.0",
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class SEOResult(BaseModel):
    title: Optional[str] = None
    meta_description: Optional[str] = None
    h1_tags: List[str] = []
    canonical_url: Optional[str] = None
    robots_txt_present: bool = False
    sitemap_present: bool = False


class AccessibilityResult(BaseModel):
    missing_alt_tags: int = 0
    images_without_alt: List[str] = []
    aria_roles: List[str] = []
    contrast_issues: List[str] = []


class ComplianceResult(BaseModel):
    has_cookie_banner: bool = False
    gdpr_compliant: Optional[bool] = None
    has_privacy_policy: bool = False
    has_terms_of_service: bool = False


class TechnicalResult(BaseModel):
    tech_stack: List[str] = []
    viewport_meta: Optional[str] = None
    doctype: Optional[str] = None
    is_https: bool = False
    has_analytics: bool = False


class BrokenLink(BaseModel):
    url: str
    status: Optional[int] = None
    text: Optional[str] = None


class AnalysisRequest(BaseModel):
    url: str
    screenshot: bool = False
    mobile_test: bool = False
    check_broken_links: bool = False
    depth: int = 1


class AnalysisResponse(BaseModel):
    url: str
    seo: SEOResult
    accessibility: AccessibilityResult
    compliance: ComplianceResult
    technical: TechnicalResult
    broken_links: List[BrokenLink] = []
    mobile_friendly: Optional[bool] = None
    screenshot_base64: Optional[str] = None
    load_time: Optional[float] = None
    success: bool
    error: Optional[str] = None


async def analyze_page(page, url: str, options: AnalysisRequest):
    """Collect SEO, accessibility, compliance and technical data from an already loaded page."""
    result = {
        "url": url,
        "seo": {},
        "accessibility": {},
        "compliance": {},
        "technical": {},
        "broken_links": [],
        "success": True,
    }

    # --- SEO checks ---
    title = await page.title()
    meta_description = await page.evaluate('''() => {
        const meta = document.querySelector('meta[name="description"]');
        return meta ? meta.content : null;
    }''')
    h1_tags = await page.evaluate('''() => {
        return Array.from(document.querySelectorAll('h1')).map(h => h.textContent.trim());
    }''')

    result["seo"] = {
        "title": title,
        "meta_description": meta_description,
        "h1_tags": h1_tags,
    }

    # --- Accessibility checks ---
    # Images with no alt attribute at all (decorative images with alt="" still pass).
    images_without_alt = await page.evaluate('''() => {
        return Array.from(document.querySelectorAll('img:not([alt])'))
            .map(img => img.src);
    }''')

    result["accessibility"] = {
        "missing_alt_tags": len(images_without_alt),
        "images_without_alt": images_without_alt,
    }

    # --- Compliance checks ---
    # Crude heuristic: report a cookie/consent banner if consent-related
    # keywords appear anywhere in the visible page text.
    has_cookie_banner = await page.evaluate('''() => {
        const keywords = ['cookie', 'gdpr', 'privacy', 'consent'];
        const text = (document.body ? document.body.textContent : '').toLowerCase();
        return keywords.some(kw => text.includes(kw));
    }''')

    result["compliance"] = {
        "has_cookie_banner": has_cookie_banner,
    }

    # --- Technical checks ---
    tech_stack = []

    # Detect a few common client-side libraries via their window globals.
    libraries = await page.evaluate('''() => {
        const libs = [];
        if (window.jQuery) libs.push('jQuery');
        if (window.React) libs.push('React');
        if (window.Vue) libs.push('Vue');
        if (window.angular) libs.push('Angular');
        return libs;
    }''')
    tech_stack.extend(libraries)

    has_analytics = await page.evaluate('''() => {
        return !!document.querySelector('script[src*="google-analytics.com"], script[src*="googletagmanager.com"]');
    }''')

    is_https = url.startswith('https://')

    result["technical"] = {
        "tech_stack": tech_stack,
        "is_https": is_https,
        "has_analytics": has_analytics,
    }

    # --- Broken-link check ---
    # Note: only links found on the analyzed page are checked; depth > 1 is not crawled.
    if options.check_broken_links and options.depth > 0:
        links = await page.evaluate('''() => {
            return Array.from(document.querySelectorAll('a[href]')).map(a => ({
                href: a.href,
                text: a.textContent.trim()
            }));
        }''')

        # Only check same-domain links, capped at 10 to bound the run time.
        domain = urlparse(url).netloc
        internal_links = [
            link for link in links
            if link['href'].startswith('http') and urlparse(link['href']).netloc == domain
        ][:10]

        broken_links = []
        for link in internal_links:
            try:
                # Use the page's request context instead of page.goto so the
                # page itself stays on the original URL (e.g. for screenshots).
                response = await page.request.get(link['href'])
                if response.status >= 400:
                    broken_links.append({
                        "url": link['href'],
                        "status": response.status,
                        "text": link['text'],
                    })
            except Exception:
                # Network error or unreachable host: report the link with no status code.
                broken_links.append({
                    "url": link['href'],
                    "status": None,
                    "text": link['text'],
                })

        result["broken_links"] = broken_links

    return result
@app.post("/analyze", response_model=AnalysisResponse) |
|
async def analyze_website(request: AnalysisRequest): |
|
"""Analyze a website for quality and compliance metrics""" |
|
async with async_playwright() as p: |
|
try: |
|
browser = await p.chromium.launch() |
|
context = await browser.new_context() |
|
page = await context.new_page() |
|
|
|
|
|
start_time = asyncio.get_event_loop().time() |
|
|
|
|
|
response = await page.goto(request.url, wait_until="domcontentloaded") |
|
if not response or response.status >= 400: |
|
raise HTTPException(status_code=400, detail=f"Failed to load page. Status: {response.status if response else 'unknown'}") |
|
|
|
|
|
if request.mobile_test: |
|
mobile_viewport = {'width': 375, 'height': 667} |
|
await page.set_viewport_size(mobile_viewport) |
|
result = await analyze_page(page, request.url, request) |
|
result["mobile_friendly"] = True |
|
else: |
|
result = await analyze_page(page, request.url, request) |
|
|
|
|
|
if request.screenshot: |
|
screenshot = await page.screenshot(full_page=True) |
|
result["screenshot_base64"] = base64.b64encode(screenshot).decode('utf-8') |
|
|
|
|
|
end_time = asyncio.get_event_loop().time() |
|
result["load_time"] = end_time - start_time |
|
|
|
await browser.close() |
|
return result |
|
|
|
except Exception as e: |
|
logger.error(f"Error analyzing website: {str(e)}") |
|
await browser.close() |
|
raise HTTPException(status_code=500, detail=str(e)) |
|
|
|
@app.get("/analyze", response_model=AnalysisResponse) |
|
async def analyze_website_get( |
|
url: str = Query(..., description="URL to analyze"), |
|
screenshot: bool = Query(False, description="Include screenshot"), |
|
mobile_test: bool = Query(False, description="Test mobile responsiveness"), |
|
check_broken_links: bool = Query(False, description="Check for broken links"), |
|
depth: int = Query(1, description="Depth for broken links check") |
|
): |
|
"""GET endpoint for website analysis""" |
|
request = AnalysisRequest( |
|
url=url, |
|
screenshot=screenshot, |
|
mobile_test=mobile_test, |
|
check_broken_links=check_broken_links, |
|
depth=depth |
|
) |
|
return await analyze_website(request) |
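

# Example requests against a locally running instance (a sketch, assuming the
# default host/port configured below and a reachable target site):
#
#   curl -X POST http://localhost:8000/analyze \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://example.com", "screenshot": true, "check_broken_links": true}'
#
#   curl "http://localhost:8000/analyze?url=https://example.com&mobile_test=true"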


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)