Azzan Dwi Riski committed on
Commit
7646a9b
·
1 Parent(s): c0af825

update the code to handle ads and cloudflare challenge fixed2

Browse files
Files changed (1) hide show
  1. app.py +31 -16
app.py CHANGED
@@ -260,29 +260,36 @@ def create_browser_context(playwright):
260
  )
261
 
262
  def setup_request_interception(page):
263
- redirect_count = 0
264
 
265
  def handle_request(route):
266
- nonlocal redirect_count
267
  request = route.request
 
268
 
269
  # Block known ad/tracking patterns
270
- if any(pattern in request.url.lower() for pattern in BLOCK_PATTERNS):
271
- print(f"Blocking request to: {request.url}")
272
  route.abort()
273
  return
274
-
275
- # Handle redirects
276
- if request.redirect_chain:
277
- redirect_count += 1
278
- if redirect_count > MAX_REDIRECTS:
279
- print(f"Too many redirects ({redirect_count}), aborting request")
280
- route.abort()
281
- return
282
-
 
283
  # Continue with the request
284
  route.continue_()
285
 
 
 
 
 
 
 
286
  page.route("**/*", handle_request)
287
 
288
  def take_screenshot(url):
@@ -305,17 +312,22 @@ def take_screenshot(url):
305
  print("Attempting to navigate to URL...")
306
  response = page.goto(
307
  url,
308
- wait_until="commit", # Changed to commit instead of domcontentloaded
309
  timeout=PAGE_TIMEOUT
310
  )
311
 
312
  if not response:
313
  print("No response received, attempting to continue...")
 
 
314
  elif response.status >= 400:
315
  print(f"Received error status code: {response.status}")
316
 
317
  # Try to wait for the page to be more stable
318
- wait_for_page_stable(page)
 
 
 
319
 
320
  # Take screenshot even if page might not be fully loaded
321
  print("Taking screenshot...")
@@ -333,7 +345,10 @@ def take_screenshot(url):
333
  except:
334
  raise nav_error
335
  finally:
336
- context.close()
 
 
 
337
 
338
  if os.path.exists(filepath):
339
  print(f"Screenshot saved successfully to {filepath}")
 
260
  )
261
 
262
  def setup_request_interception(page):
263
+ redirect_urls = set()
264
 
265
  def handle_request(route):
 
266
  request = route.request
267
+ url = request.url
268
 
269
  # Block known ad/tracking patterns
270
+ if any(pattern in url.lower() for pattern in BLOCK_PATTERNS):
271
+ print(f"Blocking request to: {url}")
272
  route.abort()
273
  return
274
+
275
+ # Track potential redirects by monitoring navigation requests
276
+ if request.resource_type == "document":
277
+ if url in redirect_urls:
278
+ if len(redirect_urls) > MAX_REDIRECTS:
279
+ print(f"Too many redirects (>{MAX_REDIRECTS}), aborting request")
280
+ route.abort()
281
+ return
282
+ redirect_urls.add(url)
283
+
284
  # Continue with the request
285
  route.continue_()
286
 
287
+ # Listen for response events to detect redirects
288
+ def handle_response(response):
289
+ if response.status >= 300 and response.status <= 399:
290
+ redirect_urls.add(response.url)
291
+
292
+ page.on("response", handle_response)
293
  page.route("**/*", handle_request)
294
 
295
  def take_screenshot(url):
 
312
  print("Attempting to navigate to URL...")
313
  response = page.goto(
314
  url,
315
+ wait_until="commit",
316
  timeout=PAGE_TIMEOUT
317
  )
318
 
319
  if not response:
320
  print("No response received, attempting to continue...")
321
+ elif response.status >= 300 and response.status <= 399:
322
+ print(f"Received redirect status code: {response.status}")
323
  elif response.status >= 400:
324
  print(f"Received error status code: {response.status}")
325
 
326
  # Try to wait for the page to be more stable
327
+ try:
328
+ wait_for_page_stable(page)
329
+ except Exception as e:
330
+ print(f"Page stability warning: {e}")
331
 
332
  # Take screenshot even if page might not be fully loaded
333
  print("Taking screenshot...")
 
345
  except:
346
  raise nav_error
347
  finally:
348
+ try:
349
+ context.close()
350
+ except Exception as close_error:
351
+ print(f"Warning: Error while closing context: {close_error}")
352
 
353
  if os.path.exists(filepath):
354
  print(f"Screenshot saved successfully to {filepath}")