Spaces:
Paused
Paused
| from selenium import webdriver | |
| from selenium.webdriver.chrome.service import Service | |
| import time | |
| from bs4 import BeautifulSoup | |
| import tempfile | |
def scrape_website(website):
    """Load *website* in headless Chrome and return the rendered page source.

    A fresh Chrome user-data directory is created for every call so that
    sessions do not conflict with each other. The directory is removed again
    after the browser has been shut down, whether or not the page load
    succeeded.

    Args:
        website: URL handed to ``driver.get``.

    Returns:
        The ``driver.page_source`` string read after ``driver.get`` returns.
    """
    import shutil  # local import: only needed to clean up the profile directory

    print("Launching chrome browser...")
    chrome_driver_path = "/usr/bin/chromedriver"

    options = webdriver.ChromeOptions()
    # Essential arguments for headless environments like Hugging Face
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    # Use a unique user data directory to prevent session conflicts
    unique_user_data_dir = tempfile.mkdtemp()
    try:
        options.add_argument(f"--user-data-dir={unique_user_data_dir}")
        driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
        try:
            driver.get(website)
            print("Page Loaded...")
            return driver.page_source
        finally:
            # Always stop the browser, even when the page load raises.
            driver.quit()
    finally:
        # mkdtemp() leaves deletion to the caller; without this every call
        # would leave a Chrome profile directory behind. Cleanup is best
        # effort so a failed removal never masks the real result or error.
        shutil.rmtree(unique_user_data_dir, ignore_errors=True)
def extract_body_content(html_content):
    """Return the ``<body>`` element of *html_content* as a string.

    The markup is parsed with BeautifulSoup's ``html.parser``. When the parsed
    document exposes no body, an empty string is returned instead.
    """
    body = BeautifulSoup(html_content, "html.parser").body
    return str(body) if body else ""
def clean_body_content(body_content):
    """Reduce an HTML fragment to its visible text, one non-empty line each.

    ``<script>`` and ``<style>`` elements are removed from the parsed tree
    first. The remaining text is then extracted with newline separators,
    every line is stripped of surrounding whitespace, and blank lines are
    discarded.
    """
    parsed = BeautifulSoup(body_content, "html.parser")

    # Drop elements whose contents are code or styling, not page text.
    for tag in parsed.find_all(["script", "style"]):
        tag.extract()

    raw_text = parsed.get_text(separator="\n")

    kept_lines = []
    for raw_line in raw_text.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            kept_lines.append(trimmed)
    return "\n".join(kept_lines)
def split_dom_content(dom_content, max_length=60000):
    """Split *dom_content* into consecutive chunks of at most *max_length*.

    Args:
        dom_content: The sequence (normally a string) to slice up.
        max_length: Maximum size of each chunk; must be at least 1.

    Returns:
        A list of slices of ``dom_content`` in their original order. The
        last slice may be shorter than ``max_length``; empty input yields
        an empty list.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    # A negative step would make range() empty and silently drop all content,
    # and a zero step fails with an unrelated-looking range() error.
    if max_length < 1:
        raise ValueError(f"max_length must be at least 1, got {max_length}")

    return [
        dom_content[start:start + max_length]
        for start in range(0, len(dom_content), max_length)
    ]