apexherbert200 committed on
Commit
050c5eb
·
1 Parent(s): 7b2b9d3

Added middleware

Browse files
Files changed (1) hide show
  1. webrify2.py +65 -24
webrify2.py CHANGED
@@ -4,6 +4,7 @@ from pydantic import BaseModel
4
  from typing import Optional
5
  import base64
6
  import json
 
7
  from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
8
 
9
  app = FastAPI(title="Web Analyzer API")
@@ -19,47 +20,87 @@ class MetadataResponse(BaseModel):
19
  twitter: dict
20
  canonical: Optional[str]
21
 
 
 
 
 
 
 
22
 
23
- # async def get_page(url):
24
- # pw = await async_playwright().start()
25
- # browser = await pw.chromium.launch(headless=True)
26
- # page = await browser.new_page()
27
- # try:
28
- # await page.goto(url, timeout=30000)
29
- # except PlaywrightTimeoutError:
30
- # raise HTTPException(status_code=504, detail="Page load timed out")
31
- # return page, browser, pw
32
-
33
-
34
-
35
  async def get_page(url):
 
 
36
  pw = await async_playwright().start()
37
  browser = await pw.chromium.launch(headless=True)
38
  context = await browser.new_context()
39
 
40
- # Stealth: hide headless detection
41
  await context.add_init_script(
42
  "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
43
  )
44
 
45
  page = await context.new_page()
46
- page.set_default_timeout(90000) # Apply to all waits
47
 
48
  try:
49
- # Try networkidle first (wait for full load)
50
- await page.goto(url, timeout=90000, wait_until="networkidle")
51
- await page.wait_for_selector("body", timeout=10000) # Ensure DOM is visible
52
- except PlaywrightTimeoutError:
53
  try:
54
- # Fallback to lighter load event
55
- await page.goto(url, timeout=90000, wait_until="load")
56
- except Exception as e:
57
- await browser.close()
58
- await pw.stop()
59
- raise HTTPException(status_code=504, detail=f"Page load failed: {str(e)}")
 
 
 
 
60
 
 
 
 
 
 
 
 
61
  return page, browser, pw
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  @app.middleware("http")
64
  async def remove_leaky_headers(request: Request, call_next):
65
  response: Response = await call_next(request)
 
4
  from typing import Optional
5
  import base64
6
  import json
7
+ import asyncio
8
  from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
9
 
10
  app = FastAPI(title="Web Analyzer API")
 
20
  twitter: dict
21
  canonical: Optional[str]
22
 
23
+ # Optional timeout wrapper to enforce global timeout
24
async def timeout_wrapper(coro, timeout=20):
    """Await ``coro`` under a deadline and translate a timeout into HTTP 504.

    Args:
        coro: Awaitable to run; it is handed to ``asyncio.wait_for``.
        timeout: Maximum number of seconds to wait (default 20).

    Returns:
        Whatever ``coro`` returns when it finishes in time.

    Raises:
        HTTPException: 504 "Operation timed out" when the deadline expires.
            Other exceptions raised by ``coro`` propagate unchanged.
    """
    try:
        return await asyncio.wait_for(coro, timeout)
    except asyncio.TimeoutError as exc:
        # NOTE(review): wait_for cancels ``coro`` on timeout; resources the
        # coroutine had acquired are only released if it handles cancellation.
        # Chain the cause so the original timeout stays visible in tracebacks.
        raise HTTPException(status_code=504, detail="Operation timed out") from exc
29
 
30
+ # More robust get_page() with fallbacks, stealth, and logging
 
 
 
 
 
 
 
 
 
 
 
31
  async def get_page(url):
32
+ print(f"[INFO] Visiting URL: {url}")
33
+
34
  pw = await async_playwright().start()
35
  browser = await pw.chromium.launch(headless=True)
36
  context = await browser.new_context()
37
 
38
+ # Stealth mode: prevent simple headless detection
39
  await context.add_init_script(
40
  "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
41
  )
42
 
43
  page = await context.new_page()
44
+ page.set_default_timeout(20000) # 20s max for waits on elements
45
 
46
  try:
 
 
 
 
47
  try:
48
+ print("[INFO] Trying to load with 'domcontentloaded'")
49
+ await page.goto(url, wait_until="domcontentloaded", timeout=20000)
50
+ except PlaywrightTimeoutError:
51
+ print("[WARN] domcontentloaded failed, trying 'load'")
52
+ await page.goto(url, wait_until="load", timeout=20000)
53
+
54
+ try:
55
+ await page.wait_for_selector("body", timeout=5000)
56
+ except Exception:
57
+ print("[WARN] <body> not found quickly. May still continue.")
58
 
59
+ except Exception as e:
60
+ print(f"[ERROR] Page load failed for {url}: {e}")
61
+ await browser.close()
62
+ await pw.stop()
63
+ raise HTTPException(status_code=504, detail=f"Page load failed: {str(e)}")
64
+
65
+ print("[INFO] Page loaded successfully.")
66
  return page, browser, pw
67
 
68
+
69
+
70
+
71
+
72
+
73
+
74
+ # async def get_page(url):
75
+ # pw = await async_playwright().start()
76
+ # browser = await pw.chromium.launch(headless=True)
77
+ # context = await browser.new_context()
78
+
79
+ # # Stealth: hide headless detection
80
+ # await context.add_init_script(
81
+ # "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
82
+ # )
83
+
84
+ # page = await context.new_page()
85
+ # page.set_default_timeout(90000) # Apply to all waits
86
+
87
+ # try:
88
+ # # Try networkidle first (wait for full load)
89
+ # await page.goto(url, timeout=90000, wait_until="networkidle")
90
+ # await page.wait_for_selector("body", timeout=10000) # Ensure DOM is visible
91
+ # except PlaywrightTimeoutError:
92
+ # try:
93
+ # # Fallback to lighter load event
94
+ # await page.goto(url, timeout=90000, wait_until="load")
95
+ # except Exception as e:
96
+ # await browser.close()
97
+ # await pw.stop()
98
+ # raise HTTPException(status_code=504, detail=f"Page load failed: {str(e)}")
99
+
100
+ # return page, browser, pw
101
+
102
+
103
+
104
  @app.middleware("http")
105
  async def remove_leaky_headers(request: Request, call_next):
106
  response: Response = await call_next(request)