File size: 1,667 Bytes
188a2fe
 
7fadeaa
188a2fe
f64f0d6
646a14d
188a2fe
3d4df23
7fadeaa
 
659aea7
7fadeaa
 
f64f0d6
 
 
 
 
 
 
 
 
 
 
c7ebd2b
188a2fe
 
7fadeaa
 
bf7ff08
7fadeaa
f64f0d6
188a2fe
7fadeaa
3d4df23
bf7ff08
0f53b6e
7fadeaa
 
 
 
0f53b6e
 
 
7fadeaa
3d4df23
7fadeaa
0f53b6e
7fadeaa
 
 
 
 
0f53b6e
7fadeaa
3d4df23
7fadeaa
3d4df23
7fadeaa
7e33257
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import shutil
import tempfile
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service



def scrape_website(website, chrome_driver_path="/usr/bin/chromedriver"):
    """Fetch the fully rendered HTML of *website* with headless Chrome.

    Parameters
    ----------
    website : str
        URL to load.
    chrome_driver_path : str
        Path to the chromedriver binary. Defaults to the previous
        hard-coded Linux location, so existing callers are unaffected.

    Returns
    -------
    str
        The page source as rendered by the browser.
    """
    print("Launching chrome browser...")

    options = webdriver.ChromeOptions()

    # Essential arguments for headless environments like Hugging Face
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    # Use a unique user data directory to prevent session conflicts
    unique_user_data_dir = tempfile.mkdtemp()
    options.add_argument(f"--user-data-dir={unique_user_data_dir}")

    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)

    try:
        driver.get(website)
        print("Page Loaded...")
        return driver.page_source
    finally:
        driver.quit()
        # Fix: the temporary profile directory was previously never removed,
        # leaking one directory per call. Best-effort cleanup after quit.
        shutil.rmtree(unique_user_data_dir, ignore_errors=True)


def extract_body_content(html_content):
    """Return the <body> element of *html_content* as a string, or "" if the
    document has no body."""
    parsed = BeautifulSoup(html_content, "html.parser")
    body = parsed.body
    return str(body) if body is not None else ""

def clean_body_content(body_content):
    """Extract visible text from *body_content* HTML.

    Removes <script> and <style> tags, then returns the remaining text with
    each line stripped and blank lines dropped, joined by newlines.
    """
    parsed = BeautifulSoup(body_content, "html.parser")

    # Script and style contents are code/CSS, not page text — drop them.
    for noise_tag in parsed(["script", "style"]):
        noise_tag.extract()

    raw_text = parsed.get_text(separator="\n")
    stripped = (line.strip() for line in raw_text.splitlines())
    return "\n".join(line for line in stripped if line)

def split_dom_content(dom_content, max_length=60000):
    """Split *dom_content* into consecutive chunks of at most *max_length*
    characters; the final chunk may be shorter. Empty input yields []."""
    chunks = []
    start = 0
    total = len(dom_content)
    while start < total:
        chunks.append(dom_content[start:start + max_length])
        start += max_length
    return chunks