from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

def scrape_website(website):
    """Load the given URL in headless Chromium and return the rendered HTML."""
    print("Launching Chromium browser...")

    chrome_driver_path = "/usr/lib/chromium/chromedriver"  # depends on your Dockerfile

    options = Options()
    options.binary_location = "/usr/bin/chromium"  # important!
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    # ✅ This is where the options had to be passed
    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)

    try:
        driver.get(website)
        print("Page Loaded...")
        html = driver.page_source
        return html
    finally:
        driver.quit()
        
def extract_body_content(html_content):
    """Return the page's <body> element as a string, or "" if it is missing."""
    soup = BeautifulSoup(html_content, "html.parser")
    body_content = soup.body
    if body_content:
        return str(body_content)
    return ""

def clean_body_content(body_content):
    """Strip <script>/<style> tags and collapse the text to non-empty lines."""
    soup = BeautifulSoup(body_content, "html.parser")

    # Drop script and style elements so only visible text remains
    for script_or_style in soup(["script", "style"]):
        script_or_style.extract()

    cleaned_content = soup.get_text(separator="\n")
    cleaned_content = "\n".join(
        line.strip() for line in cleaned_content.splitlines() if line.strip()
    )

    return cleaned_content

def split_dom_content(dom_content, max_length=60000):
    """Split the text into consecutive chunks of at most max_length characters."""
    return [
        dom_content[i:i + max_length] for i in range(0, len(dom_content), max_length)
    ]
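
# A minimal usage sketch chaining the helpers above into a
# scrape -> extract -> clean -> split pipeline. It assumes Chromium and
# chromedriver live at the paths configured in scrape_website();
# "https://example.com" is a hypothetical placeholder URL.
if __name__ == "__main__":
    raw_html = scrape_website("https://example.com")
    body = extract_body_content(raw_html)
    text = clean_body_content(body)
    chunks = split_dom_content(text)
    print(f"Extracted {len(text)} characters of text in {len(chunks)} chunk(s)")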