PyQuarX committed on
Commit
7e33257
·
verified ·
1 Parent(s): 90924f6

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +2 -11
scraper.py CHANGED
@@ -8,21 +8,12 @@ from bs4 import BeautifulSoup
8
  def scrape_website(website):
9
  print("Launching chrome browser...")
10
 
11
-
12
  chrome_driver_path = "/usr/bin/chromedriver"
13
-
14
-
15
  options = webdriver.ChromeOptions()
16
- options.add_argument('--headless')
17
- options.add_argument('--no-sandbox')
18
- options.add_argument('--disable-dev-shm-usage')
19
-
20
-
21
-
22
 
23
 
24
  try:
25
- driver = webdriver.Chrome(options=options)
26
  driver.get(website)
27
  print("Page Loaded...")
28
  html = driver.page_source
@@ -56,4 +47,4 @@ def clean_body_content(body_content):
56
  def split_dom_content(dom_content,max_length=60000):
57
  return [
58
  dom_content[i:i+max_length] for i in range(0,len(dom_content),max_length)
59
- ]
 
8
  def scrape_website(website):
9
  print("Launching chrome browser...")
10
 
 
11
  chrome_driver_path = "/usr/bin/chromedriver"
 
 
12
  options = webdriver.ChromeOptions()
13
+ driver = webdriver.Chrome(service=Service(chrome_driver_path, options=options))
 
 
 
 
 
14
 
15
 
16
  try:
 
17
  driver.get(website)
18
  print("Page Loaded...")
19
  html = driver.page_source
 
47
  def split_dom_content(dom_content,max_length=60000):
48
  return [
49
  dom_content[i:i+max_length] for i in range(0,len(dom_content),max_length)
50
+ ]