File size: 1,414 Bytes
188a2fe
 
1ab0ddc
 
188a2fe
646a14d
188a2fe
1ab0ddc
188a2fe
636189b
1ab0ddc
 
636189b
1ab0ddc
 
 
636189b
 
c7ebd2b
188a2fe
 
1ab0ddc
188a2fe
 
 
 
 
646a14d
188a2fe
fed0ea1
188a2fe
fed0ea1
188a2fe
 
fed0ea1
 
188a2fe
1ab0ddc
188a2fe
fed0ea1
188a2fe
fed0ea1
1ab0ddc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup

def scrape_website(
    website,
    driver_path="/usr/lib/chromium/chromedriver",
    binary_path="/usr/bin/chromium",
):
    """Fetch *website* with a headless Chromium browser and return its HTML.

    Args:
        website: URL to load.
        driver_path: Path to the chromedriver executable. Defaults to the
            previously hard-coded system location, so existing callers are
            unaffected.
        binary_path: Path to the Chromium binary, same backward-compatible
            default as before.

    Returns:
        The fully rendered page source as a string.
    """
    print("Launching chromium browser...")

    options = Options()
    options.binary_location = binary_path
    # Flags needed to run Chromium headless inside containers/CI.
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(service=Service(driver_path), options=options)

    try:
        driver.get(website)
        print("Page Loaded...")
        return driver.page_source
    finally:
        # Always release the browser process, even if navigation fails.
        driver.quit()


def extract_body_content(html_content):
    """Return the <body> element of *html_content* serialized as a string.

    Returns an empty string when the document has no <body> element.
    """
    parsed = BeautifulSoup(html_content, "html.parser")
    body = parsed.body
    if body is None:
        return ""
    return str(body)

def clean_body_content(body_content):
    """Strip <script>/<style> tags from *body_content* and return plain text.

    The result contains only non-blank lines, each stripped of surrounding
    whitespace, joined by newlines.
    """
    parsed = BeautifulSoup(body_content, "html.parser")

    # Remove elements whose text content is code, not page copy.
    for tag in parsed.find_all(["script", "style"]):
        tag.extract()

    raw_text = parsed.get_text(separator="\n")
    stripped = (line.strip() for line in raw_text.splitlines())
    return "\n".join(line for line in stripped if line)

def split_dom_content(dom_content, max_length=60000):
    """Split *dom_content* into consecutive chunks of at most *max_length* chars.

    An empty input yields an empty list; the final chunk may be shorter
    than *max_length*.
    """
    chunks = []
    start = 0
    total = len(dom_content)
    while start < total:
        chunks.append(dom_content[start:start + max_length])
        start += max_length
    return chunks