Spaces:
Running
Running
File size: 1,667 Bytes
188a2fe 7fadeaa 188a2fe f64f0d6 646a14d 188a2fe 3d4df23 7fadeaa 659aea7 7fadeaa f64f0d6 c7ebd2b 188a2fe 7fadeaa bf7ff08 7fadeaa f64f0d6 188a2fe 7fadeaa 3d4df23 bf7ff08 0f53b6e 7fadeaa 0f53b6e 7fadeaa 3d4df23 7fadeaa 0f53b6e 7fadeaa 0f53b6e 7fadeaa 3d4df23 7fadeaa 3d4df23 7fadeaa 7e33257 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
from bs4 import BeautifulSoup
import tempfile
def scrape_website(website):
    """Load *website* in headless Chrome and return the rendered page source.

    A fresh temporary directory is used as Chrome's user-data directory for
    each call so that separate sessions do not conflict. The browser is
    always shut down and the temporary directory removed before returning,
    whether or not the page load succeeds.

    Args:
        website: URL passed to ``driver.get``.

    Returns:
        The value of ``driver.page_source`` after the page has loaded.

    Raises:
        Any exception raised while starting Chrome or loading the page is
        propagated to the caller unchanged (nothing is caught here).
    """
    # Local import: only needed for cleaning up the temporary profile.
    import shutil

    print("Launching chrome browser...")
    chrome_driver_path = "/usr/bin/chromedriver"

    options = webdriver.ChromeOptions()
    # Essential arguments for headless environments like Hugging Face
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    # Use a unique user data directory to prevent session conflicts.
    # mkdtemp() leaves deletion to the caller, so it is removed in the
    # outer ``finally`` below to avoid leaking one directory per call.
    unique_user_data_dir = tempfile.mkdtemp()
    try:
        options.add_argument(f"--user-data-dir={unique_user_data_dir}")
        driver = webdriver.Chrome(
            service=Service(chrome_driver_path), options=options
        )
        try:
            driver.get(website)
            print("Page Loaded...")
            return driver.page_source
        finally:
            # Quit the browser before its profile directory is deleted.
            driver.quit()
    finally:
        # Best-effort cleanup: a leftover file must not mask the result or
        # the original exception.
        shutil.rmtree(unique_user_data_dir, ignore_errors=True)
def extract_body_content(html_content):
    """Return the ``<body>`` element of *html_content* as a string.

    The markup is parsed with BeautifulSoup's ``html.parser``. If the parsed
    document has no body element, an empty string is returned.
    """
    body_tag = BeautifulSoup(html_content, "html.parser").body
    if not body_tag:
        return ""
    return str(body_tag)
def clean_body_content(body_content):
    """Return the visible text of *body_content*, one non-empty line per row.

    ``<script>`` and ``<style>`` elements are removed from the parsed markup,
    the remaining text is extracted with newlines between text nodes, and
    every line is stripped of surrounding whitespace; blank lines are dropped.
    """
    document = BeautifulSoup(body_content, "html.parser")

    # Detach non-content elements so their text is not part of the output.
    for unwanted in document.find_all(["script", "style"]):
        unwanted.extract()

    raw_text = document.get_text(separator="\n")

    kept_lines = []
    for raw_line in raw_text.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            kept_lines.append(trimmed)
    return "\n".join(kept_lines)
def split_dom_content(dom_content, max_length=60000):
    """Split *dom_content* into consecutive chunks of at most *max_length*.

    Args:
        dom_content: Sliceable content to split (presumably the cleaned page
            text as a string; confirm against callers).
        max_length: Maximum length of each chunk. Must be positive.

    Returns:
        A list of slices covering *dom_content* in order. Every chunk except
        possibly the last has exactly *max_length* items; empty input yields
        an empty list.

    Raises:
        ValueError: If *max_length* is not positive. Without this check a
            negative value would silently produce an empty list and drop
            all of the content.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")
    return [
        dom_content[start:start + max_length]
        for start in range(0, len(dom_content), max_length)
    ]
|