PyQuarX committed
Commit fed0ea1 · verified · 1 Parent(s): 64528a3

Update scraper.py

Files changed (1)
  1. scraper.py  +15 -25
scraper.py CHANGED
@@ -1,9 +1,7 @@
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
-import time
 from bs4 import BeautifulSoup
-
-
+import os
 
 def scrape_website(website):
     print("Launching chrome browser...")
@@ -11,47 +9,39 @@ def scrape_website(website):
     chrome_driver_path = "/usr/bin/chromedriver"
     chrome_binary_path = "/usr/bin/chromium-browser"
 
+    if not os.path.exists(chrome_driver_path):
+        raise FileNotFoundError(f"Chromedriver not found at {chrome_driver_path}")
+    if not os.path.exists(chrome_binary_path):
+        raise FileNotFoundError(f"Chromium not found at {chrome_binary_path}")
+
     options = webdriver.ChromeOptions()
     options.binary_location = chrome_binary_path
     options.add_argument("--headless")
     options.add_argument("--no-sandbox")
     options.add_argument("--disable-dev-shm-usage")
 
-    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
+    service = Service(executable_path=chrome_driver_path)
+    driver = webdriver.Chrome(service=service, options=options)
 
     try:
         driver.get(website)
         print("Page Loaded...")
         html = driver.page_source
-
         return html
-
     finally:
         driver.quit()
 
-
-
 def extract_body_content(html_content):
-    soup = BeautifulSoup(html_content,"html.parser")
+    soup = BeautifulSoup(html_content, "html.parser")
     body_content = soup.body
-    if body_content:
-        return str(body_content)
-    return ""
+    return str(body_content) if body_content else ""
 
 def clean_body_content(body_content):
-    soup = BeautifulSoup(body_content,"html.parser")
-
-    for script_or_style in soup(["script","style"]):
+    soup = BeautifulSoup(body_content, "html.parser")
+    for script_or_style in soup(["script", "style"]):
         script_or_style.extract()
-
     cleaned_content = soup.get_text(separator="\n")
-    cleaned_content = "\n".join(
-        line.strip() for line in cleaned_content.splitlines() if line.strip()
-    )
-
-    return cleaned_content
+    return "\n".join(line.strip() for line in cleaned_content.splitlines() if line.strip())
 
-def split_dom_content(dom_content,max_length=60000):
-    return [
-        dom_content[i:i+max_length] for i in range(0,len(dom_content),max_length)
-    ]
+def split_dom_content(dom_content, max_length=60000):
+    return [dom_content[i:i + max_length] for i in range(0, len(dom_content), max_length)]
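
For context, a minimal usage sketch of how the four helpers in scraper.py might be chained after this change. This is not part of the commit: it assumes the file is importable as a module named scraper, that /usr/bin/chromedriver and /usr/bin/chromium-browser actually exist, and the URL is a placeholder.

# Hypothetical usage sketch (assumptions: module importable as `scraper`,
# chromedriver and Chromium present at the hard-coded /usr/bin paths).
import scraper

url = "https://example.com"  # placeholder URL

html = scraper.scrape_website(url)         # load the page in headless Chromium and return page_source
body = scraper.extract_body_content(html)  # keep only the <body> markup ("" if there is no body)
text = scraper.clean_body_content(body)    # drop <script>/<style> content and blank lines
chunks = scraper.split_dom_content(text)   # slice into 60,000-character chunks (default max_length)

print(f"{len(text)} characters of cleaned text in {len(chunks)} chunk(s)")

Note that split_dom_content slices by raw character count, so a chunk boundary can fall mid-word or mid-tag; callers that need cleaner boundaries would have to split on whitespace or newlines themselves.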