Spaces:

PyQuarX
/

scrape-with-ai

Paused

PyQuarX committed on Apr 15

Commit

0f53b6e

verified ·

1 Parent(s): 3a92801

Update scraper.py

Files changed (1) hide show

scraper.py CHANGED Viewed

@@ -24,4 +24,29 @@ def scrape_website(website):
         return html
     finally:
         driver.quit()

         return html
     finally:
         driver.quit()
+def extract_body_content(html_content):
+    soup = BeautifulSoup(html_content,"html.parser")
+    body_content = soup.body
+    if body_content:
+        return str(body_content)
+    return ""
+def clean_body_content(body_content):
+    soup = BeautifulSoup(body_content,"html.parser")
+    for script_or_style in soup(["script","style"]):
+        script_or_style.extract()
+    cleaned_content = soup.get_text(separator="\n")
+    cleaned_content = "\n".join(
+        line.strip() for line in cleaned_content.splitlines() if line.strip()
+    )
+    return cleaned_content
+def split_dom_content(dom_content,max_length=60000):
+    return [
+        dom_content[i:i+max_length] for i in range(0,len(dom_content),max_length)
+    ]