# screenshot.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional
from urllib.parse import urlparse
import base64
import json

from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

app = FastAPI(title="Web Analyzer API")
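
# FastAPI serves interactive API docs for the endpoints below at /docs
# once the app is running (see the __main__ sketch at the end of the file).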


class ScreenshotResponse(BaseModel):
    screenshot: str


class MetadataResponse(BaseModel):
    # Optional fields default to None so the model validates under both
    # Pydantic v1 and v2.
    title: Optional[str] = None
    description: Optional[str] = None
    og: dict
    twitter: dict
    canonical: Optional[str] = None


async def get_page(url):
    # Callers are responsible for closing the returned browser and
    # stopping Playwright when they are done with the page.
    pw = await async_playwright().start()
    browser = await pw.chromium.launch(headless=True)
    page = await browser.new_page()
    try:
        await page.goto(url, timeout=30000)
    except PlaywrightTimeoutError:
        # Close the browser before surfacing the error; otherwise every
        # timed-out request leaks a Chromium process.
        await browser.close()
        await pw.stop()
        raise HTTPException(status_code=504, detail="Page load timed out")
    return page, browser, pw
@app.get("/metadata", response_model=MetadataResponse)
async def get_metadata(url: str):
page, browser, pw = await get_page(url)
try:
title = await page.title()
# Get description meta tag
try:
desc = await page.get_attribute("meta[name='description']", "content")
except Exception:
desc = None
# Extract Open Graph metadata
og = {}
for prop in ["title", "description", "image"]:
try:
selector = f"meta[property='og:{prop}']"
if await page.query_selector(selector):
og[f"og:{prop}"] = await page.get_attribute(selector, "content")
else:
og[f"og:{prop}"] = None
except Exception:
og[f"og:{prop}"] = None
# Extract Twitter metadata
twitter = {}
for prop in ["title", "description", "image"]:
try:
selector = f"meta[name='twitter:{prop}']"
if await page.query_selector(selector):
twitter[f"twitter:{prop}"] = await page.get_attribute(selector, "content")
else:
twitter[f"twitter:{prop}"] = None
except Exception:
twitter[f"twitter:{prop}"] = None
# Get canonical URL
try:
canonical = await page.get_attribute("link[rel='canonical']", "href")
except Exception:
canonical = None
return {
"title": title,
"description": desc,
"og": og,
"twitter": twitter,
"canonical": canonical
}
finally:
await browser.close()
await pw.stop()
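
# Example request (a sketch, assuming the server runs locally on the
# default uvicorn port):
#
#   curl "http://127.0.0.1:8000/metadata?url=https://example.com"
#
# The response follows MetadataResponse: title/description/canonical are
# null when the page omits them, and og/twitter map each property to its
# content attribute or null.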
@app.get("/screenshot", response_model=ScreenshotResponse)
async def get_screenshot(url: str):
page, browser, pw = await get_page(url)
try:
image_bytes = await page.screenshot(full_page=True)
image_base64 = base64.b64encode(image_bytes).decode()
return {"screenshot": image_base64}
finally:
await browser.close()
await pw.stop()
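
# Decoding the response client-side (a sketch, assuming the `requests`
# package; the endpoint itself only returns base64 text):
#
#   import base64, requests
#   r = requests.get("http://127.0.0.1:8000/screenshot",
#                    params={"url": "https://example.com"})
#   with open("shot.png", "wb") as f:
#       f.write(base64.b64decode(r.json()["screenshot"]))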
@app.get("/seo")
async def seo_audit(url: str):
page, browser, pw = await get_page(url)
try:
h1_count = await page.locator("h1").count()
imgs = await page.query_selector_all("img")
missing_alts = [await img.get_attribute("src") for img in imgs if not await img.get_attribute("alt")]
anchors = await page.query_selector_all("a[href]")
internal, external = 0, 0
for a in anchors:
href = await a.get_attribute("href")
if href and href.startswith("http"):
if url in href:
internal += 1
else:
external += 1
try:
robots = await page.get_attribute("meta[name='robots']", "content")
except Exception:
robots = None
try:
canonical = await page.get_attribute("link[rel='canonical']", "href")
except Exception:
canonical = None
return {
"h1_count": h1_count,
"missing_image_alts": missing_alts,
"internal_links": internal,
"external_links": external,
"robots_meta": robots,
"has_canonical": bool(canonical)
}
finally:
await browser.close()
await pw.stop()
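
# Example request (sketch): curl "http://127.0.0.1:8000/seo?url=https://example.com"
# Note that relative hrefs (e.g. "/about") are skipped by the
# startswith("http") guard above, so internal_links only counts absolute
# same-host URLs.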
@app.get("/performance")
async def performance_metrics(url: str):
page, browser, pw = await get_page(url)
try:
# Get navigation timing
try:
nav_timing = await page.evaluate("JSON.stringify(performance.getEntriesByType('navigation'))")
timing = json.loads(nav_timing)[0] if nav_timing else {}
page_load_time = timing.get('duration', None)
except Exception:
page_load_time = None
# Get First Contentful Paint
try:
fcp = await page.evaluate("performance.getEntriesByName('first-contentful-paint')[0]?.startTime")
except Exception:
fcp = None
# Get Largest Contentful Paint
try:
lcp = await page.evaluate("performance.getEntriesByType('largest-contentful-paint')[0]?.renderTime")
except Exception:
lcp = None
# Get Cumulative Layout Shift
try:
cls_entries = await page.evaluate("JSON.stringify(performance.getEntriesByType('layout-shift'))")
cls = sum(e.get('value', 0) for e in json.loads(cls_entries) if isinstance(e, dict))
except Exception:
cls = None
return {
"page_load_time_ms": page_load_time,
"first_contentful_paint": fcp,
"largest_contentful_paint": lcp,
"cumulative_layout_shift": cls
}
finally:
await browser.close()
await pw.stop()
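
# Caveat: Chromium exposes 'largest-contentful-paint' and 'layout-shift'
# entries through PerformanceObserver rather than the synchronous
# getEntriesByType calls above, which often return [] and fall through to
# None. A minimal observer-based sketch (an assumption about one way to do
# it, not part of the original endpoint):
#
#   lcp = await page.evaluate("""() => new Promise(resolve => {
#       new PerformanceObserver(list => {
#           const e = list.getEntries().pop();
#           resolve(e.renderTime || e.loadTime);
#       }).observe({type: 'largest-contentful-paint', buffered: true});
#       setTimeout(() => resolve(null), 3000);  // give up after 3 s
#   })""")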
@app.get("/structured-data")
async def structured_data(url: str):
page, browser, pw = await get_page(url)
try:
scripts = await page.query_selector_all("script[type='application/ld+json']")
json_ld_list = []
for s in scripts:
text = await s.inner_text()
try:
data = json.loads(text)
json_ld_list.append(data)
except Exception:
continue
types = []
for obj in json_ld_list:
if isinstance(obj, dict) and "@type" in obj:
types.append(obj["@type"])
return {
"schema_found": bool(json_ld_list),
"types": types,
"schema": json_ld_list
}
finally:
await browser.close()
await pw.stop()
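
# A page embedding JSON-LD like the following (hypothetical example) would
# be reported with schema_found=True and types=["Article"]:
#
#   <script type="application/ld+json">
#     {"@context": "https://schema.org", "@type": "Article",
#      "headline": "Example headline"}
#   </script>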
@app.get("/accessibility")
async def accessibility_check(url: str):
page, browser, pw = await get_page(url)
try:
imgs = await page.query_selector_all("img")
missing_alt = len([img for img in imgs if not await img.get_attribute("alt")])
buttons = await page.query_selector_all("button")
missing_labels = len([b for b in buttons if not await b.get_attribute("aria-label") and not await b.inner_text()])
landmarks = []
for tag in ["main", "nav", "footer", "header"]:
if await page.query_selector(tag):
landmarks.append(tag)
return {
"images_missing_alt": missing_alt,
"buttons_missing_label": missing_labels,
"landmarks": landmarks
}
finally:
await browser.close()
await pw.stop()
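

# Minimal entry point for running the API directly; a sketch assuming
# uvicorn is installed (equivalently: `uvicorn screenshot:app --reload`).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)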