PyQuarX committed on
Commit
0f53b6e
·
verified ·
1 Parent(s): 3a92801

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +25 -0
scraper.py CHANGED
@@ -24,4 +24,29 @@ def scrape_website(website):
24
  return html
25
  finally:
26
  driver.quit()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
 
24
  return html
25
  finally:
26
  driver.quit()
27
+
28
def extract_body_content(html_content):
    """Return the ``<body>`` element of an HTML document as a string.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend.
    When the parsed document exposes no ``<body>`` element, an empty
    string is returned instead.
    """
    body_tag = BeautifulSoup(html_content, "html.parser").body
    # A missing <body> yields None from ``.body``; map that to "".
    return str(body_tag) if body_tag else ""
34
+
35
def clean_body_content(body_content):
    """Convert HTML markup into plain text with blank lines removed.

    ``<script>`` and ``<style>`` elements are removed from the parsed tree
    before the text is extracted. The remaining text is split into lines,
    each line is stripped of surrounding whitespace, and lines that end up
    empty are dropped. The surviving lines are joined with newlines.
    """
    parsed = BeautifulSoup(body_content, "html.parser")

    # Remove script/style elements so their contents do not leak into the text.
    for tag in parsed.find_all(["script", "style"]):
        tag.extract()

    raw_text = parsed.get_text(separator="\n")

    kept_lines = []
    for raw_line in raw_text.splitlines():
        stripped = raw_line.strip()
        if stripped:
            kept_lines.append(stripped)

    return "\n".join(kept_lines)
47
+
48
def split_dom_content(dom_content, max_length=60000):
    """Split ``dom_content`` into consecutive chunks of at most ``max_length``.

    Args:
        dom_content: The sliceable content to split (any object supporting
            ``len()`` and slicing, e.g. a string).
        max_length: Maximum size of each chunk. Must be a positive integer.

    Returns:
        A list of slices of ``dom_content`` in order; every chunk except
        possibly the last has exactly ``max_length`` items. Empty input
        yields an empty list.

    Raises:
        ValueError: If ``max_length`` is not positive. Without this check a
            negative value would make ``range()`` empty and silently discard
            all content, and zero would fail with an unrelated ``range()``
            error message.
    """
    if max_length <= 0:
        raise ValueError(
            f"max_length must be a positive integer, got {max_length!r}"
        )

    return [
        dom_content[start:start + max_length]
        for start in range(0, len(dom_content), max_length)
    ]
52