Commit
·
f089137
1
Parent(s):
5611064
Working / experimenting
Browse files
- screenshot.py +68 -23
screenshot.py
CHANGED
@@ -1,11 +1,10 @@
|
|
1 |
-
#
|
2 |
-
from fastapi import FastAPI, HTTPException
|
3 |
from pydantic import BaseModel
|
4 |
-
from typing import
|
5 |
import base64
|
6 |
import json
|
7 |
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
|
8 |
-
import asyncio
|
9 |
|
10 |
app = FastAPI(title="Web Analyzer API")
|
11 |
|
@@ -37,23 +36,42 @@ async def get_metadata(url: str):
|
|
37 |
page, browser, pw = await get_page(url)
|
38 |
try:
|
39 |
title = await page.title()
|
40 |
-
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
43 |
og = {}
|
44 |
for prop in ["title", "description", "image"]:
|
45 |
-
selector = f"meta[property='og:{prop}']"
|
46 |
try:
|
|
|
47 |
if await page.query_selector(selector):
|
48 |
og[f"og:{prop}"] = await page.get_attribute(selector, "content")
|
49 |
else:
|
50 |
og[f"og:{prop}"] = None
|
51 |
-
except Exception
|
52 |
og[f"og:{prop}"] = None
|
53 |
|
54 |
-
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
return {
|
58 |
"title": title,
|
59 |
"description": desc,
|
@@ -94,8 +112,15 @@ async def seo_audit(url: str):
|
|
94 |
internal += 1
|
95 |
else:
|
96 |
external += 1
|
97 |
-
|
98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
return {
|
100 |
"h1_count": h1_count,
|
101 |
"missing_image_alts": missing_alts,
|
@@ -108,19 +133,39 @@ async def seo_audit(url: str):
|
|
108 |
await browser.close()
|
109 |
await pw.stop()
|
110 |
|
111 |
-
|
112 |
@app.get("/performance")
|
113 |
async def performance_metrics(url: str):
|
114 |
page, browser, pw = await get_page(url)
|
115 |
try:
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
return {
|
123 |
-
"page_load_time_ms":
|
124 |
"first_contentful_paint": fcp,
|
125 |
"largest_contentful_paint": lcp,
|
126 |
"cumulative_layout_shift": cls
|
@@ -176,4 +221,4 @@ async def accessibility_check(url: str):
|
|
176 |
}
|
177 |
finally:
|
178 |
await browser.close()
|
179 |
-
await pw.stop()
|
|
|
1 |
+
# scrape.py
|
2 |
+
from fastapi import FastAPI, HTTPException
|
3 |
from pydantic import BaseModel
|
4 |
+
from typing import Optional
|
5 |
import base64
|
6 |
import json
|
7 |
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
|
|
|
8 |
|
9 |
app = FastAPI(title="Web Analyzer API")
|
10 |
|
|
|
36 |
page, browser, pw = await get_page(url)
|
37 |
try:
|
38 |
title = await page.title()
|
39 |
+
|
40 |
+
# Get description meta tag
|
41 |
+
try:
|
42 |
+
desc = await page.get_attribute("meta[name='description']", "content")
|
43 |
+
except Exception:
|
44 |
+
desc = None
|
45 |
+
|
46 |
+
# Extract Open Graph metadata
|
47 |
og = {}
|
48 |
for prop in ["title", "description", "image"]:
|
|
|
49 |
try:
|
50 |
+
selector = f"meta[property='og:{prop}']"
|
51 |
if await page.query_selector(selector):
|
52 |
og[f"og:{prop}"] = await page.get_attribute(selector, "content")
|
53 |
else:
|
54 |
og[f"og:{prop}"] = None
|
55 |
+
except Exception:
|
56 |
og[f"og:{prop}"] = None
|
57 |
|
58 |
+
# Extract Twitter metadata
|
59 |
+
twitter = {}
|
60 |
+
for prop in ["title", "description", "image"]:
|
61 |
+
try:
|
62 |
+
selector = f"meta[name='twitter:{prop}']"
|
63 |
+
if await page.query_selector(selector):
|
64 |
+
twitter[f"twitter:{prop}"] = await page.get_attribute(selector, "content")
|
65 |
+
else:
|
66 |
+
twitter[f"twitter:{prop}"] = None
|
67 |
+
except Exception:
|
68 |
+
twitter[f"twitter:{prop}"] = None
|
69 |
+
|
70 |
+
# Get canonical URL
|
71 |
+
try:
|
72 |
+
canonical = await page.get_attribute("link[rel='canonical']", "href")
|
73 |
+
except Exception:
|
74 |
+
canonical = None
|
75 |
return {
|
76 |
"title": title,
|
77 |
"description": desc,
|
|
|
112 |
internal += 1
|
113 |
else:
|
114 |
external += 1
|
115 |
+
try:
|
116 |
+
robots = await page.get_attribute("meta[name='robots']", "content")
|
117 |
+
except Exception:
|
118 |
+
robots = None
|
119 |
+
|
120 |
+
try:
|
121 |
+
canonical = await page.get_attribute("link[rel='canonical']", "href")
|
122 |
+
except Exception:
|
123 |
+
canonical = None
|
124 |
return {
|
125 |
"h1_count": h1_count,
|
126 |
"missing_image_alts": missing_alts,
|
|
|
133 |
await browser.close()
|
134 |
await pw.stop()
|
135 |
|
|
|
136 |
@app.get("/performance")
|
137 |
async def performance_metrics(url: str):
|
138 |
page, browser, pw = await get_page(url)
|
139 |
try:
|
140 |
+
# Get navigation timing
|
141 |
+
try:
|
142 |
+
nav_timing = await page.evaluate("JSON.stringify(performance.getEntriesByType('navigation'))")
|
143 |
+
timing = json.loads(nav_timing)[0] if nav_timing else {}
|
144 |
+
page_load_time = timing.get('duration', None)
|
145 |
+
except Exception:
|
146 |
+
page_load_time = None
|
147 |
+
|
148 |
+
# Get First Contentful Paint
|
149 |
+
try:
|
150 |
+
fcp = await page.evaluate("performance.getEntriesByName('first-contentful-paint')[0]?.startTime")
|
151 |
+
except Exception:
|
152 |
+
fcp = None
|
153 |
+
|
154 |
+
# Get Largest Contentful Paint
|
155 |
+
try:
|
156 |
+
lcp = await page.evaluate("performance.getEntriesByType('largest-contentful-paint')[0]?.renderTime")
|
157 |
+
except Exception:
|
158 |
+
lcp = None
|
159 |
+
|
160 |
+
# Get Cumulative Layout Shift
|
161 |
+
try:
|
162 |
+
cls_entries = await page.evaluate("JSON.stringify(performance.getEntriesByType('layout-shift'))")
|
163 |
+
cls = sum(e.get('value', 0) for e in json.loads(cls_entries) if isinstance(e, dict))
|
164 |
+
except Exception:
|
165 |
+
cls = None
|
166 |
+
|
167 |
return {
|
168 |
+
"page_load_time_ms": page_load_time,
|
169 |
"first_contentful_paint": fcp,
|
170 |
"largest_contentful_paint": lcp,
|
171 |
"cumulative_layout_shift": cls
|
|
|
221 |
}
|
222 |
finally:
|
223 |
await browser.close()
|
224 |
+
await pw.stop()
|