Spaces:

PyQuarX
/

scrape-with-ai

Running

File size: 1,605 Bytes

188a2fe
 
 
fed0ea1
f03ff54
188a2fe
646a14d
188a2fe
 
 
646a14d
 
c7ebd2b
646a14d
 
fed0ea1
188a2fe
c7ebd2b
 
 
 
 
fed0ea1
 
188a2fe
 
 
 
 
 
 
 
646a14d
188a2fe
fed0ea1
188a2fe
fed0ea1
188a2fe
 
fed0ea1
 
188a2fe
 
fed0ea1
188a2fe
fed0ea1

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import os
from shutil import which


def scrape_website(website):
    """Load *website* in headless Chromium and return the page source.

    The chromedriver executable and the Chromium binary are both looked up
    on PATH; the browser binary is searched as ``chromium-browser`` first
    and ``chromium`` second.

    Args:
        website: URL handed to the driver's ``get`` call.

    Returns:
        The ``page_source`` reported by the driver after navigation.

    Raises:
        EnvironmentError: If either the driver or the browser binary cannot
            be located on PATH.
    """
    print("Launching chrome browser...")

    driver_executable = which("chromedriver")
    browser_binary = which("chromium-browser")
    if not browser_binary:
        browser_binary = which("chromium")

    # Both executables are required; bail out before touching Selenium.
    if not (driver_executable and browser_binary):
        raise EnvironmentError("chromedriver or chromium-browser not found in PATH")

    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = browser_binary
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)

    browser = webdriver.Chrome(
        service=Service(executable_path=driver_executable),
        options=chrome_options,
    )

    try:
        browser.get(website)
        return browser.page_source
    finally:
        # Always shut the browser down, even when navigation fails.
        browser.quit()


def extract_body_content(html_content):
    """Return the ``<body>`` element of *html_content* as a string.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend. When
    the parsed document exposes no body, an empty string is returned.
    """
    body_tag = BeautifulSoup(html_content, "html.parser").body
    if body_tag:
        return str(body_tag)
    return ""

def clean_body_content(body_content):
    """Reduce HTML in *body_content* to its non-blank text lines.

    ``<script>`` and ``<style>`` elements are removed from the parsed tree,
    the remaining text is extracted with newline separators, and every line
    is stripped of surrounding whitespace. Lines that end up empty are
    dropped; the survivors are joined with ``"\\n"``.
    """
    parsed = BeautifulSoup(body_content, "html.parser")

    # Detach script/style elements so their contents do not appear as text.
    for tag in parsed.find_all(["script", "style"]):
        tag.extract()

    raw_text = parsed.get_text(separator="\n")
    stripped_lines = (line.strip() for line in raw_text.splitlines())
    return "\n".join(line for line in stripped_lines if line)

def split_dom_content(dom_content, max_length=60000):
    """Split *dom_content* into consecutive chunks of at most *max_length*.

    Args:
        dom_content: The sliceable content (typically a string) to split.
        max_length: Maximum size of each chunk; must be a positive integer.

    Returns:
        A list of slices covering *dom_content* in order. Every chunk has
        exactly *max_length* items except possibly the last one. Empty
        input yields an empty list.

    Raises:
        ValueError: If *max_length* is not positive. Without this check a
            negative value would produce an empty ``range`` and silently
            drop all content, while zero would surface an opaque
            ``range()`` error.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be a positive integer, got {max_length!r}")

    return [
        dom_content[start:start + max_length]
        for start in range(0, len(dom_content), max_length)
    ]