Update app.py
app.py
CHANGED
@@ -2,19 +2,32 @@ import gradio as gr
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin, urlparse
-from reportlab.pdfgen import canvas
 from reportlab.lib.pagesizes import letter
 from reportlab.lib.styles import getSampleStyleSheet
 from reportlab.platypus import SimpleDocTemplate, Paragraph
 from io import BytesIO
 import tempfile
 
+def get_page_content(url):
+    try:
+        response = requests.get(url)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        content = []
+        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
+            for element in soup.find_all(tag):
+                text = element.get_text(strip=True)
+                if text:
+                    content.append(text)
+        return content
+    except Exception as e:
+        return [f"Error processing {url}: {str(e)}"]
+
 def get_subdirectory_pages(url, base_url, visited=set()):
     if url in visited:
         return []
 
     visited.add(url)
-    pages = [url]
+    pages = [(url, get_page_content(url))]
 
     try:
         response = requests.get(url)
@@ -41,20 +54,11 @@ def website_to_pdf(url):
     styles = getSampleStyleSheet()
     story = []
 
-    for page_url in all_pages:
+    for page_url, content in all_pages:
         story.append(Paragraph(f"<b>{page_url}</b>", styles['Heading1']))
-
-
-
-
-        # Extract text from specific tags
-        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
-            for element in soup.find_all(tag):
-                text = element.get_text(strip=True)
-                if text:
-                    story.append(Paragraph(text, styles['BodyText']))
-    except Exception as e:
-        story.append(Paragraph(f"Error processing {page_url}: {str(e)}", styles['BodyText']))
+        for text in content:
+            story.append(Paragraph(text, styles['BodyText']))
+        story.append(Paragraph("<br/><br/>", styles['BodyText']))
 
     doc.build(story)
 
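Taken together, the commit drops the unused reportlab.pdfgen.canvas import, moves per-page scraping into a new get_page_content() helper, and has get_subdirectory_pages() return (url, content) tuples so website_to_pdf() no longer re-parses pages while building the PDF. The diff only shows fragments of the crawler and the PDF builder; below is a minimal sketch of how the refactored app.py plausibly fits together. The link-filtering loop, the SimpleDocTemplate/tempfile setup, and the Gradio wiring are assumptions, not code from the commit.

# Sketch of app.py after this commit. Everything marked "assumed"
# is a reconstruction of code the diff does not show.
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from xml.sax.saxutils import escape  # not in the commit; see comments below
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph
from io import BytesIO
import tempfile

def get_page_content(url):
    # As in the commit: collect readable text from common content tags.
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        content = []
        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
            for element in soup.find_all(tag):
                text = element.get_text(strip=True)
                if text:
                    content.append(text)
        return content
    except Exception as e:
        return [f"Error processing {url}: {str(e)}"]

def get_subdirectory_pages(url, base_url, visited=set()):
    # Note: the mutable default keeps `visited` alive across calls,
    # which the recursion relies on within a single crawl.
    if url in visited:
        return []
    visited.add(url)
    pages = [(url, get_page_content(url))]
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Assumed: follow only links that stay under base_url.
        for link in soup.find_all('a', href=True):
            full_url = urljoin(url, link['href'])
            if full_url.startswith(base_url) and full_url not in visited:
                pages.extend(get_subdirectory_pages(full_url, base_url, visited))
    except Exception:
        pass
    return pages

def website_to_pdf(url):
    # Assumed setup: the diff only shows the story-building loop.
    base_url = f"{urlparse(url).scheme}://{urlparse(url).netloc}"
    all_pages = get_subdirectory_pages(url, base_url)
    pdf_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    doc = SimpleDocTemplate(pdf_file.name, pagesize=letter)
    styles = getSampleStyleSheet()
    story = []
    for page_url, content in all_pages:
        # escape() is an addition here, not in the commit: Paragraph
        # parses its text as mini-XML, so a raw '&' or '<' from a
        # scraped page would otherwise raise.
        story.append(Paragraph(f"<b>{escape(page_url)}</b>", styles['Heading1']))
        for text in content:
            story.append(Paragraph(escape(text), styles['BodyText']))
        story.append(Paragraph("<br/><br/>", styles['BodyText']))
    doc.build(story)
    return pdf_file.name

# Assumed Gradio wiring for the Space:
demo = gr.Interface(fn=website_to_pdf,
                    inputs=gr.Textbox(label="Website URL"),
                    outputs=gr.File(label="PDF"))
if __name__ == "__main__":
    demo.launch()

The gr.File output lets the Space hand the generated temp file back to the browser; the commit's actual interface code is not visible in this diff.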