Spaces:
Running
Running
File size: 1,414 Bytes
188a2fe 1ab0ddc 188a2fe 646a14d 188a2fe 1ab0ddc 188a2fe 636189b 1ab0ddc 636189b 1ab0ddc 636189b c7ebd2b 188a2fe 1ab0ddc 188a2fe 646a14d 188a2fe fed0ea1 188a2fe fed0ea1 188a2fe fed0ea1 188a2fe 1ab0ddc 188a2fe fed0ea1 188a2fe fed0ea1 1ab0ddc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup
def scrape_website(website):
    """Load a page in headless Chromium and return its HTML source.

    Args:
        website: URL passed to the browser's ``get`` call.

    Returns:
        The driver's ``page_source`` read after ``get`` has returned.

    The browser session is always ended, even when loading the page raises.
    """
    print("Launching chromium browser...")

    driver_location = "/usr/lib/chromium/chromedriver"
    chrome_options = Options()
    chrome_options.binary_location = "/usr/bin/chromium"
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)

    browser = webdriver.Chrome(
        service=Service(driver_location),
        options=chrome_options,
    )
    try:
        browser.get(website)
        print("Page Loaded...")
        return browser.page_source
    finally:
        # Runs on both the success and the error path.
        browser.quit()
def extract_body_content(html_content):
    """Return the ``<body>`` element of an HTML document as a string.

    Args:
        html_content: HTML markup to parse with ``html.parser``.

    Returns:
        The stringified body element, or ``""`` when the parsed document
        exposes no body.
    """
    body_tag = BeautifulSoup(html_content, "html.parser").body
    if not body_tag:
        return ""
    return str(body_tag)
def clean_body_content(body_content):
    """Convert HTML markup to plain text with one non-blank line per row.

    Args:
        body_content: HTML markup to parse with ``html.parser``.

    Returns:
        The document text with ``<script>`` and ``<style>`` elements removed,
        every line stripped of surrounding whitespace, and empty lines
        dropped; lines are joined with ``"\\n"``.
    """
    parsed = BeautifulSoup(body_content, "html.parser")

    # Remove script and style elements so their contents are not part of the text.
    for tag in parsed(["script", "style"]):
        tag.extract()

    raw_text = parsed.get_text(separator="\n")
    stripped_lines = (line.strip() for line in raw_text.splitlines())
    return "\n".join(line for line in stripped_lines if line)
def split_dom_content(dom_content, max_length=60000):
    """Split content into consecutive chunks of at most ``max_length`` items.

    Args:
        dom_content: The string (or other sliceable sequence) to split.
        max_length: Maximum length of each chunk; must be positive.

    Returns:
        A list of slices of ``dom_content`` in their original order. Every
        chunk except possibly the last has exactly ``max_length`` items.
        Empty input yields an empty list.

    Raises:
        ValueError: If ``max_length`` is zero or negative.
    """
    # A negative step would make range() empty and silently drop all content,
    # and a zero step fails with an unrelated-looking message; reject both.
    if max_length <= 0:
        raise ValueError(
            f"max_length must be a positive integer, got {max_length!r}"
        )
    return [
        dom_content[start:start + max_length]
        for start in range(0, len(dom_content), max_length)
    ]
|