PyQuarX committed on
Commit
3a92801
·
verified ·
1 Parent(s): 1ab0ddc

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +3 -19
scraper.py CHANGED
@@ -1,20 +1,20 @@
1
  from selenium import webdriver
2
  from selenium.webdriver.chrome.service import Service
3
  from selenium.webdriver.chrome.options import Options
4
- import time
5
  from bs4 import BeautifulSoup
6
 
7
  def scrape_website(website):
8
  print("Launching chromium browser...")
9
 
10
- chrome_driver_path = "/usr/lib/chromium/chromedriver"
11
 
12
  options = Options()
13
- options.binary_location = "/usr/bin/chromium"
14
  options.add_argument("--headless")
15
  options.add_argument("--no-sandbox")
16
  options.add_argument("--disable-dev-shm-usage")
17
 
 
18
  driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
19
 
20
  try:
@@ -25,19 +25,3 @@ def scrape_website(website):
25
  finally:
26
  driver.quit()
27
 
28
-
29
- def extract_body_content(html_content):
30
- soup = BeautifulSoup(html_content, "html.parser")
31
- body_content = soup.body
32
- return str(body_content) if body_content else ""
33
-
34
- def clean_body_content(body_content):
35
- soup = BeautifulSoup(body_content, "html.parser")
36
- for script_or_style in soup(["script", "style"]):
37
- script_or_style.extract()
38
-
39
- cleaned_content = soup.get_text(separator="\n")
40
- return "\n".join(line.strip() for line in cleaned_content.splitlines() if line.strip())
41
-
42
- def split_dom_content(dom_content, max_length=60000):
43
- return [dom_content[i:i + max_length] for i in range(0, len(dom_content), max_length)]
 
1
  from selenium import webdriver
2
  from selenium.webdriver.chrome.service import Service
3
  from selenium.webdriver.chrome.options import Options
 
4
  from bs4 import BeautifulSoup
5
 
6
  def scrape_website(website):
7
  print("Launching chromium browser...")
8
 
9
+ chrome_driver_path = "/usr/lib/chromium/chromedriver" # dépend de ton Dockerfile
10
 
11
  options = Options()
12
+ options.binary_location = "/usr/bin/chromium" # important !
13
  options.add_argument("--headless")
14
  options.add_argument("--no-sandbox")
15
  options.add_argument("--disable-dev-shm-usage")
16
 
17
+ # ✅ C’est ici qu’il fallait mettre options
18
  driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
19
 
20
  try:
 
25
  finally:
26
  driver.quit()
27