PyQuarX committed on
Commit
bf7ff08
·
verified ·
1 Parent(s): 0f53b6e

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +8 -11
scraper.py CHANGED
@@ -4,26 +4,23 @@ from selenium.webdriver.chrome.options import Options
4
  from bs4 import BeautifulSoup
5
 
6
def scrape_website(website):
    """Load *website* in headless Chromium and return its rendered HTML.

    The driver is always quit, even when navigation fails.
    """
    print("Launching chromium browser...")

    driver_path = "/usr/lib/chromium/chromedriver"  # depends on the Dockerfile layout

    chrome_opts = Options()
    chrome_opts.binary_location = "/usr/bin/chromium"  # required: point at the Chromium binary
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_opts.add_argument(flag)

    # Options belong on the webdriver.Chrome constructor, not on Service().
    browser = webdriver.Chrome(service=Service(driver_path), options=chrome_opts)

    try:
        browser.get(website)
        print("Page Loaded...")
        return browser.page_source
    finally:
        browser.quit()
 
27
 
28
  def extract_body_content(html_content):
29
  soup = BeautifulSoup(html_content,"html.parser")
 
4
  from bs4 import BeautifulSoup
5
 
6
def scrape_website(website):
    """Fetch the fully rendered HTML of *website* with headless Chrome.

    Parameters:
        website (str): URL to load.

    Returns:
        str: the page source as rendered by the driver.

    The driver is always quit via the finally block, even if navigation raises.
    """
    print("Launching chrome browser...")

    chrome_driver_path = "/usr/bin/chromedriver"
    options = webdriver.ChromeOptions()
    # BUG FIX: `options` was previously passed into Service(...), which does not
    # accept an `options` keyword (TypeError at runtime). It must be passed to
    # webdriver.Chrome itself.
    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)

    try:
        driver.get(website)
        print("Page Loaded...")
        html = driver.page_source
        return html
    finally:
        driver.quit()
23
+
24
 
25
  def extract_body_content(html_content):
26
  soup = BeautifulSoup(html_content,"html.parser")