precollege_scraper

Sleeping

App Files Files Community

adityaiiitr committed on Oct 23, 2024

Commit

42c440b

verified ·

1 Parent(s): 0f2ff7e

updated for reddit.com

Browse files

Files changed (1) hide show

main.py +12 -5

main.py CHANGED Viewed

@@ -36,15 +36,17 @@ async def scrape_visible_text(url):
     try:
         logger.info(f"Starting to scrape visible text from URL: {url}")
         async with async_playwright() as p:
-            browser = await p.chromium.launch(headless=True)  # Launch browser in headless mode
             context = await browser.new_context(
                 user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
                 viewport={"width": 1280, "height": 800},
                 extra_http_headers={
                     "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
                     "accept-encoding": "gzip, deflate, br, zstd",
-                    "accept-language": "en-US,en;q=0.9,hi;q=0.8",
-                    "cache-control": "max-age=0",
                     "sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
                     "sec-ch-ua-mobile": "?0",
                     "sec-ch-ua-platform": '"Windows"',
@@ -52,15 +54,20 @@ async def scrape_visible_text(url):
                     "sec-fetch-mode": "navigate",
                     "sec-fetch-site": "none",
                     "sec-fetch-user": "?1",
-                    "upgrade-insecure-requests": "1"
                 }
             )
             page = await context.new_page()
-            await page.goto(url, wait_until="domcontentloaded")
             visible_text = await page.evaluate("document.body.innerText")
             await browser.close()
             logger.info(f"Successfully scraped visible text from URL: {url}")
             return visible_text
     except Exception as e:
         logger.error(f"Error while scraping visible text from URL {url}: {e}")
         raise

     try:
         logger.info(f"Starting to scrape visible text from URL: {url}")
         async with async_playwright() as p:
+            # Launch the browser in headless mode (can change to headless=False if you want to see it)
+            browser = await p.chromium.launch(headless=True)
             context = await browser.new_context(
                 user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
                 viewport={"width": 1280, "height": 800},
                 extra_http_headers={
                     "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
                     "accept-encoding": "gzip, deflate, br, zstd",
+                    "accept-language": "en-US,en;q=0.9",
+                    "cache-control": "no-cache",
+                    "pragma": "no-cache",
                     "sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
                     "sec-ch-ua-mobile": "?0",
                     "sec-ch-ua-platform": '"Windows"',
                     "sec-fetch-mode": "navigate",
                     "sec-fetch-site": "none",
                     "sec-fetch-user": "?1",
+                    "upgrade-insecure-requests": "1",
                 }
             )
             page = await context.new_page()
+            await page.goto(url, wait_until="domcontentloaded")  # Wait until the DOM is fully loaded
+            # Extract visible text from the body of the page
             visible_text = await page.evaluate("document.body.innerText")
             await browser.close()
             logger.info(f"Successfully scraped visible text from URL: {url}")
             return visible_text
     except Exception as e:
         logger.error(f"Error while scraping visible text from URL {url}: {e}")
         raise