adityaiiitr committed on
Commit
42c440b
·
verified ·
1 Parent(s): 0f2ff7e

updated for reddit.com

Browse files
Files changed (1) hide show
  1. main.py +12 -5
main.py CHANGED
@@ -36,15 +36,17 @@ async def scrape_visible_text(url):
36
  try:
37
  logger.info(f"Starting to scrape visible text from URL: {url}")
38
  async with async_playwright() as p:
39
- browser = await p.chromium.launch(headless=True) # Launch browser in headless mode
 
40
  context = await browser.new_context(
41
  user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
42
  viewport={"width": 1280, "height": 800},
43
  extra_http_headers={
44
  "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
45
  "accept-encoding": "gzip, deflate, br, zstd",
46
- "accept-language": "en-US,en;q=0.9,hi;q=0.8",
47
- "cache-control": "max-age=0",
 
48
  "sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
49
  "sec-ch-ua-mobile": "?0",
50
  "sec-ch-ua-platform": '"Windows"',
@@ -52,15 +54,20 @@ async def scrape_visible_text(url):
52
  "sec-fetch-mode": "navigate",
53
  "sec-fetch-site": "none",
54
  "sec-fetch-user": "?1",
55
- "upgrade-insecure-requests": "1"
56
  }
57
  )
 
58
  page = await context.new_page()
59
- await page.goto(url, wait_until="domcontentloaded")
 
 
60
  visible_text = await page.evaluate("document.body.innerText")
 
61
  await browser.close()
62
  logger.info(f"Successfully scraped visible text from URL: {url}")
63
  return visible_text
 
64
  except Exception as e:
65
  logger.error(f"Error while scraping visible text from URL {url}: {e}")
66
  raise
 
36
  try:
37
  logger.info(f"Starting to scrape visible text from URL: {url}")
38
  async with async_playwright() as p:
39
+ # Launch the browser in headless mode (can change to headless=False if you want to see it)
40
+ browser = await p.chromium.launch(headless=True)
41
  context = await browser.new_context(
42
  user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
43
  viewport={"width": 1280, "height": 800},
44
  extra_http_headers={
45
  "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
46
  "accept-encoding": "gzip, deflate, br, zstd",
47
+ "accept-language": "en-US,en;q=0.9",
48
+ "cache-control": "no-cache",
49
+ "pragma": "no-cache",
50
  "sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
51
  "sec-ch-ua-mobile": "?0",
52
  "sec-ch-ua-platform": '"Windows"',
 
54
  "sec-fetch-mode": "navigate",
55
  "sec-fetch-site": "none",
56
  "sec-fetch-user": "?1",
57
+ "upgrade-insecure-requests": "1",
58
  }
59
  )
60
+
61
  page = await context.new_page()
62
+ await page.goto(url, wait_until="domcontentloaded") # Wait until the DOM is fully loaded
63
+
64
+ # Extract visible text from the body of the page
65
  visible_text = await page.evaluate("document.body.innerText")
66
+
67
  await browser.close()
68
  logger.info(f"Successfully scraped visible text from URL: {url}")
69
  return visible_text
70
+
71
  except Exception as e:
72
  logger.error(f"Error while scraping visible text from URL {url}: {e}")
73
  raise