Spaces:

PyQuarX
/

scrape-with-ai

Running

File size: 1,468 Bytes

188a2fe
 
1ab0ddc
188a2fe
646a14d
188a2fe
bf7ff08
188a2fe
bf7ff08
 
bd742c8
 
bf7ff08
bd742c8
659aea7
 
bf7ff08
c7ebd2b
188a2fe
 
1ab0ddc
188a2fe
bf7ff08
188a2fe
bf7ff08
188a2fe
 
bf7ff08
0f53b6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188a2fe

import shutil
import tempfile

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

def scrape_website(website):
    """Load *website* in Chrome via Selenium and return the page's HTML source.

    A fresh temporary directory is used as the Chrome user-data directory
    for each call. The browser is always quit and the temporary directory
    removed, whether or not loading the page succeeds.

    Args:
        website: URL to open in the browser.

    Returns:
        The value of ``driver.page_source`` after the page has loaded.
    """
    print("Launching chrome browser...")

    chrome_driver_path = "/usr/bin/chromedriver"
    # Unique profile directory per call instead of a fixed shared path.
    user_data_dir = tempfile.mkdtemp(prefix="chrome-user-data-")

    try:
        options = webdriver.ChromeOptions()
        options.add_argument(f"--user-data-dir={user_data_dir}")

        # Options are an argument of webdriver.Chrome, not of Service.
        driver = webdriver.Chrome(
            service=Service(chrome_driver_path),
            options=options,
        )

        try:
            driver.get(website)
            print("Page Loaded...")
            return driver.page_source
        finally:
            driver.quit()
    finally:
        # Best-effort cleanup of the temporary profile directory.
        shutil.rmtree(user_data_dir, ignore_errors=True)

        
def extract_body_content(html_content):
    """Return the ``<body>`` element of *html_content* as a string.

    The markup is parsed with BeautifulSoup's ``html.parser``. An empty
    string is returned when the parsed document has no body.
    """
    body = BeautifulSoup(html_content, "html.parser").body
    return str(body) if body else ""

def clean_body_content(body_content):
    """Return the visible text of *body_content*, one non-blank line per row.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted. Each resulting line is stripped of surrounding whitespace
    and lines that are empty after stripping are dropped.
    """
    parsed = BeautifulSoup(body_content, "html.parser")

    # Remove script/style elements so their contents don't appear in the text.
    for tag in parsed(["script", "style"]):
        tag.extract()

    raw_text = parsed.get_text(separator="\n")

    kept_lines = []
    for raw_line in raw_text.splitlines():
        if raw_line.strip():
            kept_lines.append(raw_line.strip())

    return "\n".join(kept_lines)

def split_dom_content(dom_content, max_length=60000):
    """Split *dom_content* into consecutive slices of at most *max_length*.

    Returns a list of slices in order; the final slice may be shorter.
    Empty input yields an empty list.
    """
    chunks = []
    for start in range(0, len(dom_content), max_length):
        end = start + max_length
        chunks.append(dom_content[start:end])
    return chunks