Azzan Dwi Riski
committed on
Commit
·
7646a9b
1
Parent(s):
c0af825
update the code to handle ads and cloudflare challenge fixed2
Browse files
app.py
CHANGED
@@ -260,29 +260,36 @@ def create_browser_context(playwright):
|
|
260 |
)
|
261 |
|
262 |
def setup_request_interception(page):
|
263 |
-
|
264 |
|
265 |
def handle_request(route):
|
266 |
-
nonlocal redirect_count
|
267 |
request = route.request
|
|
|
268 |
|
269 |
# Block known ad/tracking patterns
|
270 |
-
if any(pattern in
|
271 |
-
print(f"Blocking request to: {
|
272 |
route.abort()
|
273 |
return
|
274 |
-
|
275 |
-
#
|
276 |
-
if request.
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
|
|
283 |
# Continue with the request
|
284 |
route.continue_()
|
285 |
|
|
|
|
|
|
|
|
|
|
|
|
|
286 |
page.route("**/*", handle_request)
|
287 |
|
288 |
def take_screenshot(url):
|
@@ -305,17 +312,22 @@ def take_screenshot(url):
|
|
305 |
print("Attempting to navigate to URL...")
|
306 |
response = page.goto(
|
307 |
url,
|
308 |
-
wait_until="commit",
|
309 |
timeout=PAGE_TIMEOUT
|
310 |
)
|
311 |
|
312 |
if not response:
|
313 |
print("No response received, attempting to continue...")
|
|
|
|
|
314 |
elif response.status >= 400:
|
315 |
print(f"Received error status code: {response.status}")
|
316 |
|
317 |
# Try to wait for the page to be more stable
|
318 |
-
|
|
|
|
|
|
|
319 |
|
320 |
# Take screenshot even if page might not be fully loaded
|
321 |
print("Taking screenshot...")
|
@@ -333,7 +345,10 @@ def take_screenshot(url):
|
|
333 |
except:
|
334 |
raise nav_error
|
335 |
finally:
|
336 |
-
|
|
|
|
|
|
|
337 |
|
338 |
if os.path.exists(filepath):
|
339 |
print(f"Screenshot saved successfully to {filepath}")
|
|
|
260 |
)
|
261 |
|
262 |
def setup_request_interception(page):
    """Install request/response hooks on ``page``.

    Two hooks are registered:

    * a ``"response"`` listener that records the URL of every response
      whose status is in the 3xx range, and
    * a catch-all route (``"**/*"``) that aborts requests matching
      ``BLOCK_PATTERNS``, aborts a repeated document request once more than
      ``MAX_REDIRECTS`` URLs have been recorded, and lets everything else
      continue.

    NOTE(review): this block was reconstructed from a flattened diff view in
    which indentation was lost; the nesting of the redirect bookkeeping
    should be confirmed against the real file.
    """
    # URLs seen so far, shared by both hooks below.
    seen_urls = set()

    def _on_request(route):
        """Decide whether a single intercepted request is aborted or continued."""
        request = route.request
        url = request.url
        lowered = url.lower()

        # Block known ad/tracking patterns (case-insensitive substring match).
        for pattern in BLOCK_PATTERNS:
            if pattern in lowered:
                print(f"Blocking request to: {url}")
                route.abort()
                return

        # Track document requests; a revisited URL is only aborted once the
        # number of recorded URLs has grown past the configured limit.
        if request.resource_type == "document":
            revisited = url in seen_urls
            if revisited and len(seen_urls) > MAX_REDIRECTS:
                print(f"Too many redirects (>{MAX_REDIRECTS}), aborting request")
                route.abort()
                return
            seen_urls.add(url)

        # Nothing objected: let the request through.
        route.continue_()

    def _on_response(response):
        """Record the URL of any response carrying a 3xx status."""
        if 300 <= response.status <= 399:
            seen_urls.add(response.url)

    # Same registration order as before: response listener, then the route.
    page.on("response", _on_response)
    page.route("**/*", _on_request)
|
294 |
|
295 |
def take_screenshot(url):
|
|
|
312 |
print("Attempting to navigate to URL...")
|
313 |
response = page.goto(
|
314 |
url,
|
315 |
+
wait_until="commit",
|
316 |
timeout=PAGE_TIMEOUT
|
317 |
)
|
318 |
|
319 |
if not response:
|
320 |
print("No response received, attempting to continue...")
|
321 |
+
elif response.status >= 300 and response.status <= 399:
|
322 |
+
print(f"Received redirect status code: {response.status}")
|
323 |
elif response.status >= 400:
|
324 |
print(f"Received error status code: {response.status}")
|
325 |
|
326 |
# Try to wait for the page to be more stable
|
327 |
+
try:
|
328 |
+
wait_for_page_stable(page)
|
329 |
+
except Exception as e:
|
330 |
+
print(f"Page stability warning: {e}")
|
331 |
|
332 |
# Take screenshot even if page might not be fully loaded
|
333 |
print("Taking screenshot...")
|
|
|
345 |
except:
|
346 |
raise nav_error
|
347 |
finally:
|
348 |
+
try:
|
349 |
+
context.close()
|
350 |
+
except Exception as close_error:
|
351 |
+
print(f"Warning: Error while closing context: {close_error}")
|
352 |
|
353 |
if os.path.exists(filepath):
|
354 |
print(f"Screenshot saved successfully to {filepath}")
|