Spaces:
Sleeping
Sleeping
updated for reddit.com
Browse files
main.py
CHANGED
@@ -36,15 +36,17 @@ async def scrape_visible_text(url):
|
|
36 |
try:
|
37 |
logger.info(f"Starting to scrape visible text from URL: {url}")
|
38 |
async with async_playwright() as p:
|
39 |
-
browser
|
|
|
40 |
context = await browser.new_context(
|
41 |
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
|
42 |
viewport={"width": 1280, "height": 800},
|
43 |
extra_http_headers={
|
44 |
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
45 |
"accept-encoding": "gzip, deflate, br, zstd",
|
46 |
-
"accept-language": "en-US,en;q=0.9
|
47 |
-
"cache-control": "
|
|
|
48 |
"sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
|
49 |
"sec-ch-ua-mobile": "?0",
|
50 |
"sec-ch-ua-platform": '"Windows"',
|
@@ -52,15 +54,20 @@ async def scrape_visible_text(url):
|
|
52 |
"sec-fetch-mode": "navigate",
|
53 |
"sec-fetch-site": "none",
|
54 |
"sec-fetch-user": "?1",
|
55 |
-
"upgrade-insecure-requests": "1"
|
56 |
}
|
57 |
)
|
|
|
58 |
page = await context.new_page()
|
59 |
-
await page.goto(url, wait_until="domcontentloaded")
|
|
|
|
|
60 |
visible_text = await page.evaluate("document.body.innerText")
|
|
|
61 |
await browser.close()
|
62 |
logger.info(f"Successfully scraped visible text from URL: {url}")
|
63 |
return visible_text
|
|
|
64 |
except Exception as e:
|
65 |
logger.error(f"Error while scraping visible text from URL {url}: {e}")
|
66 |
raise
|
|
|
36 |
try:
|
37 |
logger.info(f"Starting to scrape visible text from URL: {url}")
|
38 |
async with async_playwright() as p:
|
39 |
+
# Launch the browser in headless mode (can change to headless=False if you want to see it)
|
40 |
+
browser = await p.chromium.launch(headless=True)
|
41 |
context = await browser.new_context(
|
42 |
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
|
43 |
viewport={"width": 1280, "height": 800},
|
44 |
extra_http_headers={
|
45 |
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
46 |
"accept-encoding": "gzip, deflate, br, zstd",
|
47 |
+
"accept-language": "en-US,en;q=0.9",
|
48 |
+
"cache-control": "no-cache",
|
49 |
+
"pragma": "no-cache",
|
50 |
"sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
|
51 |
"sec-ch-ua-mobile": "?0",
|
52 |
"sec-ch-ua-platform": '"Windows"',
|
|
|
54 |
"sec-fetch-mode": "navigate",
|
55 |
"sec-fetch-site": "none",
|
56 |
"sec-fetch-user": "?1",
|
57 |
+
"upgrade-insecure-requests": "1",
|
58 |
}
|
59 |
)
|
60 |
+
|
61 |
page = await context.new_page()
|
62 |
+
await page.goto(url, wait_until="domcontentloaded") # Wait until the DOM is fully loaded
|
63 |
+
|
64 |
+
# Extract visible text from the body of the page
|
65 |
visible_text = await page.evaluate("document.body.innerText")
|
66 |
+
|
67 |
await browser.close()
|
68 |
logger.info(f"Successfully scraped visible text from URL: {url}")
|
69 |
return visible_text
|
70 |
+
|
71 |
except Exception as e:
|
72 |
logger.error(f"Error while scraping visible text from URL {url}: {e}")
|
73 |
raise
|