Spaces:
Running
Running
File size: 1,393 Bytes
188a2fe 7fadeaa 188a2fe 646a14d 188a2fe 3d4df23 7fadeaa 659aea7 7fadeaa 659aea7 7fadeaa 3d4df23 7fadeaa c7ebd2b 188a2fe 7fadeaa 188a2fe 7fadeaa bf7ff08 7fadeaa 188a2fe 7fadeaa 3d4df23 bf7ff08 0f53b6e 7fadeaa 0f53b6e 7fadeaa 3d4df23 7fadeaa 0f53b6e 7fadeaa 0f53b6e 7fadeaa 3d4df23 7fadeaa 3d4df23 7fadeaa 3d4df23 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
from bs4 import BeautifulSoup
def scrape_website(website):
    """Load *website* in headless Chrome and return the page's HTML source.

    Args:
        website: URL handed to ``driver.get``.

    Returns:
        The value of ``driver.page_source`` read right after ``driver.get``
        returns.

    Raises:
        Whatever Selenium raises if the browser cannot be started or the
        page cannot be loaded; those errors are propagated unchanged.
    """
    print("Launching chrome browser...")

    options = webdriver.ChromeOptions()
    # Run without a visible window; the next two flags are the ones commonly
    # needed when Chrome runs inside a container -- NOTE(review): confirm
    # they are still required in the deployment environment.
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    # Start the browser *before* entering ``try``: if startup fails there is
    # no driver to quit, and the real startup error must reach the caller
    # instead of being masked by an UnboundLocalError raised in ``finally``.
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(website)
        print("Page Loaded...")
        return driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails.
        driver.quit()
def extract_body_content(html_content):
    """Return the ``<body>`` element of *html_content* as a string.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend. An
    empty string is returned when the parsed document has no usable body.
    """
    body_tag = BeautifulSoup(html_content, "html.parser").body
    return str(body_tag) if body_tag else ""
def clean_body_content(body_content):
    """Reduce HTML to its text, one non-empty stripped line per row.

    ``<script>`` and ``<style>`` elements are removed from the parsed tree
    first so their contents never reach the extracted text. The remaining
    text is pulled out with newline separators, each line is stripped of
    surrounding whitespace, and lines that end up empty are discarded.
    """
    parsed = BeautifulSoup(body_content, "html.parser")

    # Detach non-content elements before reading the text.
    for unwanted in parsed.find_all(["script", "style"]):
        unwanted.extract()

    raw_text = parsed.get_text(separator="\n")

    kept_lines = []
    for raw_line in raw_text.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            kept_lines.append(trimmed)
    return "\n".join(kept_lines)
def split_dom_content(dom_content, max_length=60000):
    """Split *dom_content* into consecutive chunks of at most *max_length*.

    Args:
        dom_content: The sequence (normally a string) to split.
        max_length: Maximum length of each chunk; must be positive.

    Returns:
        A list of slices of *dom_content*, in order. Every chunk except
        possibly the last has exactly *max_length* items; empty input
        yields an empty list.

    Raises:
        ValueError: If *max_length* is not positive. A negative value would
            otherwise produce an empty range and silently drop all content.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length!r}")
    return [
        dom_content[start:start + max_length]
        for start in range(0, len(dom_content), max_length)
    ]