# scrape.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional
from urllib.parse import urlparse
import base64
import json

from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

app = FastAPI(title="Web Analyzer API")


class ScreenshotResponse(BaseModel):
    screenshot: str


class MetadataResponse(BaseModel):
    title: Optional[str]
    description: Optional[str]
    og: dict
    twitter: dict
    canonical: Optional[str]


async def get_page(url: str):
    """Launch a headless browser and navigate to the URL.

    Callers are responsible for closing the returned browser and
    stopping the Playwright instance.
    """
    pw = await async_playwright().start()
    browser = await pw.chromium.launch(headless=True)
    page = await browser.new_page()
    try:
        await page.goto(url, timeout=30000)
    except PlaywrightTimeoutError:
        # Clean up before raising, otherwise the browser process leaks.
        await browser.close()
        await pw.stop()
        raise HTTPException(status_code=504, detail="Page load timed out")
    return page, browser, pw


async def get_attr(page, selector: str, name: str) -> Optional[str]:
    """Return an attribute of the first matching element, or None.

    query_selector returns immediately for missing elements, unlike
    page.get_attribute, which waits out the full default timeout.
    """
    el = await page.query_selector(selector)
    return await el.get_attribute(name) if el else None


@app.get("/metadata", response_model=MetadataResponse)
async def get_metadata(url: str):
    page, browser, pw = await get_page(url)
    try:
        title = await page.title()
        desc = await get_attr(page, "meta[name='description']", "content")

        # Open Graph metadata
        og = {}
        for prop in ["title", "description", "image"]:
            og[f"og:{prop}"] = await get_attr(page, f"meta[property='og:{prop}']", "content")

        # Twitter card metadata
        twitter = {}
        for prop in ["title", "description", "image"]:
            twitter[f"twitter:{prop}"] = await get_attr(page, f"meta[name='twitter:{prop}']", "content")

        canonical = await get_attr(page, "link[rel='canonical']", "href")

        return {
            "title": title,
            "description": desc,
            "og": og,
            "twitter": twitter,
            "canonical": canonical,
        }
    finally:
        await browser.close()
        await pw.stop()


@app.get("/screenshot", response_model=ScreenshotResponse)
async def get_screenshot(url: str):
    page, browser, pw = await get_page(url)
    try:
        image_bytes = await page.screenshot(full_page=True)
        return {"screenshot": base64.b64encode(image_bytes).decode()}
    finally:
        await browser.close()
        await pw.stop()


@app.get("/seo")
async def seo_audit(url: str):
    page, browser, pw = await get_page(url)
    try:
        h1_count = await page.locator("h1").count()

        imgs = await page.query_selector_all("img")
        missing_alts = [
            await img.get_attribute("src")
            for img in imgs
            if not await img.get_attribute("alt")
        ]

        # Classify links by hostname rather than by substring match, so a
        # foreign URL that merely contains the target URL is not counted
        # as internal.
        base_host = urlparse(url).netloc
        anchors = await page.query_selector_all("a[href]")
        internal, external = 0, 0
        for a in anchors:
            href = await a.get_attribute("href")
            if href and href.startswith("http"):
                if urlparse(href).netloc == base_host:
                    internal += 1
                else:
                    external += 1

        robots = await get_attr(page, "meta[name='robots']", "content")
        canonical = await get_attr(page, "link[rel='canonical']", "href")

        return {
            "h1_count": h1_count,
            "missing_image_alts": missing_alts,
            "internal_links": internal,
            "external_links": external,
            "robots_meta": robots,
            "has_canonical": bool(canonical),
        }
    finally:
        await browser.close()
        await pw.stop()


@app.get("/performance")
async def performance_metrics(url: str):
    page, browser, pw = await get_page(url)
    try:
        # Navigation timing: PerformanceEntry objects don't serialize across
        # the evaluate boundary, so stringify them in the page first.
        try:
            nav_timing = await page.evaluate(
                "JSON.stringify(performance.getEntriesByType('navigation'))"
            )
            entries = json.loads(nav_timing)
            timing = entries[0] if entries else {}
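            # On a PerformanceNavigationTiming entry, 'duration' is
            # loadEventEnd - startTime, so it approximates total page
            # load time in milliseconds.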
            page_load_time = timing.get("duration")
        except Exception:
            page_load_time = None

        # First Contentful Paint is available from the performance timeline.
        try:
            fcp = await page.evaluate(
                "performance.getEntriesByName('first-contentful-paint')[0]?.startTime"
            )
        except Exception:
            fcp = None

        # Largest Contentful Paint and layout-shift entries are not exposed
        # via getEntriesByType; they must be collected with a
        # PerformanceObserver using the buffered flag. The setTimeout is a
        # fallback in case no entry is ever delivered.
        try:
            lcp = await page.evaluate("""
                () => new Promise(resolve => {
                    new PerformanceObserver(list => {
                        const entries = list.getEntries();
                        const last = entries[entries.length - 1];
                        resolve(last ? (last.renderTime || last.loadTime) : null);
                    }).observe({type: 'largest-contentful-paint', buffered: true});
                    setTimeout(() => resolve(null), 1000);
                })
            """)
        except Exception:
            lcp = None

        # Cumulative Layout Shift: sum the buffered layout-shift values,
        # skipping shifts triggered by recent user input.
        try:
            cls = await page.evaluate("""
                () => new Promise(resolve => {
                    let total = 0;
                    new PerformanceObserver(list => {
                        for (const e of list.getEntries()) {
                            if (!e.hadRecentInput) total += e.value;
                        }
                    }).observe({type: 'layout-shift', buffered: true});
                    setTimeout(() => resolve(total), 1000);
                })
            """)
        except Exception:
            cls = None

        return {
            "page_load_time_ms": page_load_time,
            "first_contentful_paint": fcp,
            "largest_contentful_paint": lcp,
            "cumulative_layout_shift": cls,
        }
    finally:
        await browser.close()
        await pw.stop()


@app.get("/structured-data")
async def structured_data(url: str):
    page, browser, pw = await get_page(url)
    try:
        scripts = await page.query_selector_all("script[type='application/ld+json']")
        json_ld_list = []
        for s in scripts:
            text = await s.text_content() or ""
            try:
                json_ld_list.append(json.loads(text))
            except Exception:
                continue

        types = [
            obj["@type"]
            for obj in json_ld_list
            if isinstance(obj, dict) and "@type" in obj
        ]

        return {
            "schema_found": bool(json_ld_list),
            "types": types,
            "schema": json_ld_list,
        }
    finally:
        await browser.close()
        await pw.stop()


@app.get("/accessibility")
async def accessibility_check(url: str):
    page, browser, pw = await get_page(url)
    try:
        imgs = await page.query_selector_all("img")
        missing_alt = len([img for img in imgs if not await img.get_attribute("alt")])

        buttons = await page.query_selector_all("button")
        missing_labels = len([
            b for b in buttons
            if not await b.get_attribute("aria-label")
            and not (await b.inner_text()).strip()
        ])

        landmarks = [
            tag for tag in ["main", "nav", "footer", "header"]
            if await page.query_selector(tag)
        ]

        return {
            "images_missing_alt": missing_alt,
            "buttons_missing_label": missing_labels,
            "landmarks": landmarks,
        }
    finally:
        await browser.close()
        await pw.stop()
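
# Usage sketch (assumes uvicorn is installed; the host, port, and output
# filename below are illustrative, not part of the service itself):
#
#   uvicorn scrape:app --host 0.0.0.0 --port 8000
#   curl "http://localhost:8000/metadata?url=https://example.com"
#   curl "http://localhost:8000/seo?url=https://example.com"
#
# Decode a screenshot from the base64 payload:
#
#   curl -s "http://localhost:8000/screenshot?url=https://example.com" \
#     | python -c "import sys, json, base64; sys.stdout.buffer.write(base64.b64decode(json.load(sys.stdin)['screenshot']))" \
#     > page.png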