# scrape.py
import base64
import json
from typing import Optional
from urllib.parse import urlparse

from fastapi import FastAPI, HTTPException
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from pydantic import BaseModel

app = FastAPI(title="Web Analyzer API")
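
# Assumed setup (not pinned by this file): FastAPI, uvicorn, and Playwright
# with a Chromium build installed, e.g.:
#   pip install fastapi uvicorn playwright
#   playwright install chromium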


class ScreenshotResponse(BaseModel):
    screenshot: str


class MetadataResponse(BaseModel):
    title: Optional[str]
    description: Optional[str]
    og: dict
    twitter: dict
    canonical: Optional[str]


async def get_page(url):
    pw = await async_playwright().start()
    browser = await pw.chromium.launch(headless=True)
    context = await browser.new_context()
    # Stealth: mask the most common headless-detection signal
    await context.add_init_script(
        "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
    )
    page = await context.new_page()
    page.set_default_timeout(90000)  # applies to all subsequent waits
    try:
        # Try networkidle first (wait until the network goes quiet)
        await page.goto(url, timeout=90000, wait_until="networkidle")
        await page.wait_for_selector("body", timeout=10000)  # ensure the DOM rendered
    except PlaywrightTimeoutError:
        try:
            # Fall back to the lighter "load" event
            await page.goto(url, timeout=90000, wait_until="load")
        except Exception as e:
            await browser.close()
            await pw.stop()
            raise HTTPException(status_code=504, detail=f"Page load failed: {e}")
    return page, browser, pw
@app.get("/metadata", response_model=MetadataResponse)
async def get_metadata(url: str):
page, browser, pw = await get_page(url)
try:
title = await page.title()
# Get description meta tag
try:
desc = await page.get_attribute("meta[name='description']", "content")
except Exception:
desc = None
# Extract Open Graph metadata
og = {}
for prop in ["title", "description", "image"]:
try:
selector = f"meta[property='og:{prop}']"
if await page.query_selector(selector):
og[f"og:{prop}"] = await page.get_attribute(selector, "content")
else:
og[f"og:{prop}"] = None
except Exception:
og[f"og:{prop}"] = None
# Extract Twitter metadata
twitter = {}
for prop in ["title", "description", "image"]:
try:
selector = f"meta[name='twitter:{prop}']"
if await page.query_selector(selector):
twitter[f"twitter:{prop}"] = await page.get_attribute(selector, "content")
else:
twitter[f"twitter:{prop}"] = None
except Exception:
twitter[f"twitter:{prop}"] = None
# Get canonical URL
try:
canonical = await page.get_attribute("link[rel='canonical']", "href")
except Exception:
canonical = None
return {
"title": title,
"description": desc,
"og": og,
"twitter": twitter,
"canonical": canonical
}
finally:
await browser.close()
await pw.stop()
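
# Example request (hypothetical local server; field values depend on the page):
#   curl "http://localhost:8000/metadata?url=https://example.com"
#   -> {"title": "Example Domain", "description": null, "og": {...},
#      "twitter": {...}, "canonical": null}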
@app.get("/screenshot", response_model=ScreenshotResponse)
async def get_screenshot(url: str):
page, browser, pw = await get_page(url)
try:
image_bytes = await page.screenshot(full_page=True)
image_base64 = base64.b64encode(image_bytes).decode()
return {"screenshot": image_base64}
finally:
await browser.close()
await pw.stop()
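
# Consumer-side sketch (hypothetical client code) for turning the base64
# payload back into a PNG file:
#   import base64, requests
#   r = requests.get("http://localhost:8000/screenshot",
#                    params={"url": "https://example.com"})
#   with open("page.png", "wb") as f:
#       f.write(base64.b64decode(r.json()["screenshot"]))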
@app.get("/seo")
async def seo_audit(url: str):
page, browser, pw = await get_page(url)
try:
h1_count = await page.locator("h1").count()
imgs = await page.query_selector_all("img")
missing_alts = [await img.get_attribute("src") for img in imgs if not await img.get_attribute("alt")]
anchors = await page.query_selector_all("a[href]")
internal, external = 0, 0
for a in anchors:
href = await a.get_attribute("href")
if href and href.startswith("http"):
if url in href:
internal += 1
else:
external += 1
try:
robots = await page.get_attribute("meta[name='robots']", "content")
except Exception:
robots = None
try:
canonical = await page.get_attribute("link[rel='canonical']", "href")
except Exception:
canonical = None
return {
"h1_count": h1_count,
"missing_image_alts": missing_alts,
"internal_links": internal,
"external_links": external,
"robots_meta": robots,
"has_canonical": bool(canonical)
}
finally:
await browser.close()
await pw.stop()
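
# Example shape of the audit payload (values depend on the target page):
#   /seo?url=https://example.com
#   -> {"h1_count": 1, "missing_image_alts": [], "internal_links": 0,
#      "external_links": 1, "robots_meta": null, "has_canonical": false}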
@app.get("/performance")
async def performance_metrics(url: str):
page, browser, pw = await get_page(url)
try:
# Get navigation timing
try:
nav_timing = await page.evaluate("JSON.stringify(performance.getEntriesByType('navigation'))")
timing = json.loads(nav_timing)[0] if nav_timing else {}
page_load_time = timing.get('duration', None)
except Exception:
page_load_time = None
# Get First Contentful Paint
try:
fcp = await page.evaluate("performance.getEntriesByName('first-contentful-paint')[0]?.startTime")
except Exception:
fcp = None
# Get Largest Contentful Paint
try:
lcp = await page.evaluate("performance.getEntriesByType('largest-contentful-paint')[0]?.renderTime")
except Exception:
lcp = None
# Get Cumulative Layout Shift
try:
cls_entries = await page.evaluate("JSON.stringify(performance.getEntriesByType('layout-shift'))")
cls = sum(e.get('value', 0) for e in json.loads(cls_entries) if isinstance(e, dict))
except Exception:
cls = None
return {
"page_load_time_ms": page_load_time,
"first_contentful_paint": fcp,
"largest_contentful_paint": lcp,
"cumulative_layout_shift": cls
}
finally:
await browser.close()
await pw.stop()
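
# Sketch of a more reliable LCP read (not wired into the endpoint above):
# browsers generally surface largest-contentful-paint entries only through a
# buffered PerformanceObserver, so polling getEntriesByType() as done above
# can come back empty. The 2-second fallback timeout is an arbitrary choice.
async def measure_lcp(page):
    return await page.evaluate(
        """() => new Promise(resolve => {
            new PerformanceObserver(list => {
                const entries = list.getEntries();
                const last = entries[entries.length - 1];
                resolve(last ? (last.renderTime || last.loadTime) : null);
            }).observe({type: 'largest-contentful-paint', buffered: true});
            setTimeout(() => resolve(null), 2000);
        })"""
    )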
@app.get("/structured-data")
async def structured_data(url: str):
page, browser, pw = await get_page(url)
try:
scripts = await page.query_selector_all("script[type='application/ld+json']")
json_ld_list = []
for s in scripts:
text = await s.inner_text()
try:
data = json.loads(text)
json_ld_list.append(data)
except Exception:
continue
types = []
for obj in json_ld_list:
if isinstance(obj, dict) and "@type" in obj:
types.append(obj["@type"])
return {
"schema_found": bool(json_ld_list),
"types": types,
"schema": json_ld_list
}
finally:
await browser.close()
await pw.stop()
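
# Example: a page embedding
#   <script type="application/ld+json">
#     {"@context": "https://schema.org", "@type": "Article", "headline": "..."}
#   </script>
# would yield {"schema_found": true, "types": ["Article"], "schema": [...]}.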
@app.get("/accessibility")
async def accessibility_check(url: str):
page, browser, pw = await get_page(url)
try:
imgs = await page.query_selector_all("img")
missing_alt = len([img for img in imgs if not await img.get_attribute("alt")])
buttons = await page.query_selector_all("button")
missing_labels = len([b for b in buttons if not await b.get_attribute("aria-label") and not await b.inner_text()])
landmarks = []
for tag in ["main", "nav", "footer", "header"]:
if await page.query_selector(tag):
landmarks.append(tag)
return {
"images_missing_alt": missing_alt,
"buttons_missing_label": missing_labels,
"landmarks": landmarks
}
finally:
await browser.close()
await pw.stop() |
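
# Local dev entry point, a minimal sketch: assumes uvicorn is installed and
# that this module is named "scrape" (per the header comment above).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)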