apexherbert200 committed
Commit 2f927bc · 1 Parent(s): f6118d7

Working /experimenting

Files changed (2)
  1. screenshot.py +161 -82
  2. webrify.py +90 -0
screenshot.py CHANGED
@@ -1,90 +1,169 @@
 
  from fastapi import FastAPI, HTTPException, Query
  from pydantic import BaseModel
- from playwright.async_api import async_playwright
- import asyncio
  import base64
- import time
- from typing import Optional, List
- import uvicorn
- import logging

- app = FastAPI()

- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger("analyzer")

- class AnalysisResult(BaseModel):
-     url: str
-     load_time: float
      title: Optional[str]
-     meta_description: Optional[str]
-     og_image: Optional[str]
-     seo_flags: List[str]
-     accessibility_flags: List[str]
-     screenshot_base64: str
-     status_code: Optional[int] = None
-
- @app.get("/analyze", response_model=AnalysisResult)
- async def analyze_website(url: str):
      try:
-         async with async_playwright() as p:
-             browser = await p.chromium.launch(headless=True)
-             context = await browser.new_context()
-             page = await context.new_page()
-
-             # Start timing
-             start_time = time.time()
-             response = await page.goto(url, timeout=60000, wait_until='domcontentloaded')
-             await page.wait_for_load_state("networkidle")
-             load_time = round(time.time() - start_time, 2)
-
-             # Screenshot
-             screenshot = await page.screenshot(full_page=True)
-             screenshot_base64 = base64.b64encode(screenshot).decode("utf-8")
-
-             # Title and meta info
-             title = await page.title()
-             meta_description = await page.eval_on_selector("meta[name='description']", "el => el.content") if await page.query_selector("meta[name='description']") else None
-             og_image = await page.eval_on_selector("meta[property='og:image']", "el => el.content") if await page.query_selector("meta[property='og:image']") else None
-
-             # SEO flags
-             seo_flags = []
-             if not title:
-                 seo_flags.append("Missing <title>")
-             if not meta_description:
-                 seo_flags.append("Missing meta description")
-             if not await page.query_selector("h1"):
-                 seo_flags.append("Missing <h1> tag")
-             if not og_image:
-                 seo_flags.append("Missing Open Graph image")
-
-             # Accessibility flags
-             accessibility_flags = []
-             images = await page.query_selector_all("img")
-             for img in images:
-                 has_alt = await img.get_attribute("alt")
-                 if not has_alt:
-                     accessibility_flags.append("Image without alt attribute")
-                     break
-
-             status_code = response.status if response else None
-
-             await browser.close()
-
-             return AnalysisResult(
-                 url=url,
-                 load_time=load_time,
-                 title=title,
-                 meta_description=meta_description,
-                 og_image=og_image,
-                 seo_flags=seo_flags,
-                 accessibility_flags=accessibility_flags,
-                 screenshot_base64=screenshot_base64,
-                 status_code=status_code
-             )
-     except Exception as e:
-         logger.error(f"Analysis failed for {url}: {str(e)}")
-         raise HTTPException(status_code=500, detail=f"Error analyzing {url}: {str(e)}")
-
- if __name__ == "__main__":
-     uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
 
+ # main.py
  from fastapi import FastAPI, HTTPException, Query
  from pydantic import BaseModel
+ from typing import List, Optional
  import base64
+ import json
+ from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
+ import asyncio
+
+ app = FastAPI(title="Web Analyzer API")


+ class ScreenshotResponse(BaseModel):
+     screenshot: str

+ class MetadataResponse(BaseModel):
      title: Optional[str]
+     description: Optional[str]
+     og: dict
+     twitter: dict
+     canonical: Optional[str]
+
+
+ async def get_page(url):
+     pw = await async_playwright().start()
+     browser = await pw.chromium.launch(headless=True)
+     page = await browser.new_page()
+     try:
+         await page.goto(url, timeout=30000)
+     except PlaywrightTimeoutError:
+         raise HTTPException(status_code=504, detail="Page load timed out")
+     return page, browser, pw
+
+
+ @app.get("/metadata", response_model=MetadataResponse)
+ async def get_metadata(url: str):
+     page, browser, pw = await get_page(url)
+     try:
+         title = await page.title()
+         desc = await page.get_attribute("meta[name='description']", "content")
+         og = {}
+         twitter = {}
+         for prop in ["title", "description", "image"]:
+             og[f"og:{prop}"] = await page.get_attribute(f"meta[property='og:{prop}']", "content")
+             twitter[f"twitter:{prop}"] = await page.get_attribute(f"meta[name='twitter:{prop}']", "content")
+         canonical = await page.get_attribute("link[rel='canonical']", "href")
+         return {
+             "title": title,
+             "description": desc,
+             "og": og,
+             "twitter": twitter,
+             "canonical": canonical
+         }
+     finally:
+         await browser.close()
+         await pw.stop()
+
+
+ @app.get("/screenshot", response_model=ScreenshotResponse)
+ async def get_screenshot(url: str):
+     page, browser, pw = await get_page(url)
+     try:
+         image_bytes = await page.screenshot(full_page=True)
+         image_base64 = base64.b64encode(image_bytes).decode()
+         return {"screenshot": image_base64}
+     finally:
+         await browser.close()
+         await pw.stop()
+
+
+ @app.get("/seo")
+ async def seo_audit(url: str):
+     page, browser, pw = await get_page(url)
+     try:
+         h1_count = await page.locator("h1").count()
+         imgs = await page.query_selector_all("img")
+         missing_alts = [await img.get_attribute("src") for img in imgs if not await img.get_attribute("alt")]
+         anchors = await page.query_selector_all("a[href]")
+         internal, external = 0, 0
+         for a in anchors:
+             href = await a.get_attribute("href")
+             if href and href.startswith("http"):
+                 if url in href:
+                     internal += 1
+                 else:
+                     external += 1
+         robots = await page.get_attribute("meta[name='robots']", "content")
+         canonical = await page.get_attribute("link[rel='canonical']", "href")
+         return {
+             "h1_count": h1_count,
+             "missing_image_alts": missing_alts,
+             "internal_links": internal,
+             "external_links": external,
+             "robots_meta": robots,
+             "has_canonical": bool(canonical)
+         }
+     finally:
+         await browser.close()
+         await pw.stop()
+
+
+ @app.get("/performance")
+ async def performance_metrics(url: str):
+     page, browser, pw = await get_page(url)
+     try:
+         nav_timing = await page.evaluate("JSON.stringify(performance.getEntriesByType('navigation'))")
+         timing = json.loads(nav_timing)[0]
+         fcp = await page.evaluate("performance.getEntriesByName('first-contentful-paint')[0]?.startTime")
+         lcp = await page.evaluate("performance.getEntriesByType('largest-contentful-paint')[0]?.renderTime")
+         cls_entries = await page.evaluate("JSON.stringify(performance.getEntriesByType('layout-shift'))")
+         cls = sum(e['value'] for e in json.loads(cls_entries))
+         return {
+             "page_load_time_ms": timing['duration'],
+             "first_contentful_paint": fcp,
+             "largest_contentful_paint": lcp,
+             "cumulative_layout_shift": cls
+         }
+     finally:
+         await browser.close()
+         await pw.stop()
+
+
+ @app.get("/structured-data")
+ async def structured_data(url: str):
+     page, browser, pw = await get_page(url)
+     try:
+         scripts = await page.query_selector_all("script[type='application/ld+json']")
+         json_ld_list = []
+         for s in scripts:
+             text = await s.inner_text()
+             try:
+                 data = json.loads(text)
+                 json_ld_list.append(data)
+             except Exception:
+                 continue
+         types = []
+         for obj in json_ld_list:
+             if isinstance(obj, dict) and "@type" in obj:
+                 types.append(obj["@type"])
+         return {
+             "schema_found": bool(json_ld_list),
+             "types": types,
+             "schema": json_ld_list
+         }
+     finally:
+         await browser.close()
+         await pw.stop()
+
+
+ @app.get("/accessibility")
+ async def accessibility_check(url: str):
+     page, browser, pw = await get_page(url)
      try:
+         imgs = await page.query_selector_all("img")
+         missing_alt = len([img for img in imgs if not await img.get_attribute("alt")])
+         buttons = await page.query_selector_all("button")
+         missing_labels = len([b for b in buttons if not await b.get_attribute("aria-label") and not await b.inner_text()])
+         landmarks = []
+         for tag in ["main", "nav", "footer", "header"]:
+             if await page.query_selector(tag):
+                 landmarks.append(tag)
+         return {
+             "images_missing_alt": missing_alt,
+             "buttons_missing_label": missing_labels,
+             "landmarks": landmarks
+         }
+     finally:
+         await browser.close()
+         await pw.stop()
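A minimal sketch of how the new endpoints might be exercised. It assumes the rewritten app is served locally with uvicorn (e.g. `uvicorn screenshot:app --port 8000`; the module path and port are assumptions, since this version of the file no longer starts a server itself) and uses the requests library, which is not part of the commit.

# Hypothetical smoke test for the new endpoints; base URL and target page are assumptions.
import base64
import requests

BASE = "http://localhost:8000"   # assumed: wherever `uvicorn screenshot:app` is listening
TARGET = "https://example.com"   # any public page to analyze

# Metadata and SEO audits return plain JSON
print(requests.get(f"{BASE}/metadata", params={"url": TARGET}).json())
print(requests.get(f"{BASE}/seo", params={"url": TARGET}).json())

# The screenshot endpoint returns base64; decode it back to image bytes
shot = requests.get(f"{BASE}/screenshot", params={"url": TARGET}).json()
with open("example.png", "wb") as f:
    f.write(base64.b64decode(shot["screenshot"]))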
webrify.py ADDED
@@ -0,0 +1,90 @@
+ from fastapi import FastAPI, HTTPException, Query
+ from pydantic import BaseModel
+ from playwright.async_api import async_playwright
+ import asyncio
+ import base64
+ import time
+ from typing import Optional, List
+ import uvicorn
+ import logging
+
+ app = FastAPI()
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger("analyzer")
+
+ class AnalysisResult(BaseModel):
+     url: str
+     load_time: float
+     title: Optional[str]
+     meta_description: Optional[str]
+     og_image: Optional[str]
+     seo_flags: List[str]
+     accessibility_flags: List[str]
+     screenshot_base64: str
+     status_code: Optional[int] = None
+
+ @app.get("/analyze", response_model=AnalysisResult)
+ async def analyze_website(url: str):
+     try:
+         async with async_playwright() as p:
+             browser = await p.chromium.launch(headless=True)
+             context = await browser.new_context()
+             page = await context.new_page()
+
+             # Start timing
+             start_time = time.time()
+             response = await page.goto(url, timeout=60000, wait_until='domcontentloaded')
+             await page.wait_for_load_state("networkidle")
+             load_time = round(time.time() - start_time, 2)
+
+             # Screenshot
+             screenshot = await page.screenshot(full_page=True)
+             screenshot_base64 = base64.b64encode(screenshot).decode("utf-8")
+
+             # Title and meta info
+             title = await page.title()
+             meta_description = await page.eval_on_selector("meta[name='description']", "el => el.content") if await page.query_selector("meta[name='description']") else None
+             og_image = await page.eval_on_selector("meta[property='og:image']", "el => el.content") if await page.query_selector("meta[property='og:image']") else None
+
+             # SEO flags
+             seo_flags = []
+             if not title:
+                 seo_flags.append("Missing <title>")
+             if not meta_description:
+                 seo_flags.append("Missing meta description")
+             if not await page.query_selector("h1"):
+                 seo_flags.append("Missing <h1> tag")
+             if not og_image:
+                 seo_flags.append("Missing Open Graph image")
+
+             # Accessibility flags
+             accessibility_flags = []
+             images = await page.query_selector_all("img")
+             for img in images:
+                 has_alt = await img.get_attribute("alt")
+                 if not has_alt:
+                     accessibility_flags.append("Image without alt attribute")
+                     break
+
+             status_code = response.status if response else None
+
+             await browser.close()
+
+             return AnalysisResult(
+                 url=url,
+                 load_time=load_time,
+                 title=title,
+                 meta_description=meta_description,
+                 og_image=og_image,
+                 seo_flags=seo_flags,
+                 accessibility_flags=accessibility_flags,
+                 screenshot_base64=screenshot_base64,
+                 status_code=status_code
+             )
+     except Exception as e:
+         logger.error(f"Analysis failed for {url}: {str(e)}")
+         raise HTTPException(status_code=500, detail=f"Error analyzing {url}: {str(e)}")
+
+ if __name__ == "__main__":
+     uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
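For the newly added webrify.py, a similar hedged sketch of a client call. Note that its `__main__` block still runs `uvicorn.run("main:app", ...)`, so serving it under its own filename (e.g. `uvicorn webrify:app --port 8000`) is an assumption, as are the host, port, and the use of requests below.

# Hypothetical call against webrify.py's single /analyze endpoint; base URL and module path are assumptions.
import base64
import requests

resp = requests.get(
    "http://localhost:8000/analyze",
    params={"url": "https://example.com"},
    timeout=120,  # a full-page load plus screenshot can be slow
)
resp.raise_for_status()
report = resp.json()

print(report["load_time"], report["seo_flags"], report["accessibility_flags"])

# The screenshot comes back base64-encoded inside the JSON payload
with open("analyze.png", "wb") as f:
    f.write(base64.b64decode(report["screenshot_base64"]))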