Update app.py
app.py CHANGED
@@ -37,13 +37,17 @@ def get_page_content(url):
 
 def get_all_doc_links(url):
     try:
+        logger.info(f"Fetching links from: {url}")
         response = requests.get(url, timeout=10)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
         main_content = soup.find('main')
         if main_content:
             links = main_content.find_all('a', href=True)
-
+            doc_links = [urljoin(url, link['href']) for link in links if link['href'].startswith('/docs')]
+            logger.info(f"Found {len(doc_links)} documentation links")
+            return doc_links
+        logger.warning("No main content found on the page")
         return []
     except Exception as e:
         logger.error(f"Error getting links from {url}: {str(e)}")
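For context, the filtering that this hunk adds boils down to resolving relative /docs hrefs against the page's base URL and discarding everything else. A minimal standalone sketch of that pattern; the base URL and HTML snippet below are made up purely for illustration:

from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Made-up base URL and markup, only to show the urljoin + startswith('/docs') filter.
base_url = "https://example.com/docs/index.html"
html = '<main><a href="/docs/intro">Intro</a><a href="https://other.site/x">Out</a></main>'

links = BeautifulSoup(html, 'html.parser').find('main').find_all('a', href=True)
doc_links = [urljoin(base_url, a['href']) for a in links if a['href'].startswith('/docs')]
print(doc_links)  # ['https://example.com/docs/intro']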
@@ -58,14 +62,18 @@ def website_to_pdf(url):
     pdf = FPDF()
     pdf.set_auto_page_break(auto=True, margin=15)
     pdf.add_page()
-    pdf.
+    pdf.add_font('DejaVu', '', 'DejaVuSansCondensed.ttf', uni=True)
+    pdf.set_font('DejaVu', size=12)
 
     for page_url in all_links:
         content = get_page_content(page_url)
         pdf.cell(0, 10, txt=page_url, ln=True)
         pdf.ln(5)
         for text in content:
-
+            try:
+                pdf.multi_cell(0, 10, txt=text)
+            except Exception as e:
+                logger.error(f"Error writing text to PDF: {str(e)}")
         pdf.add_page()
 
     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
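This hunk registers a Unicode TrueType font before writing page text, so characters outside Latin-1 in the scraped content don't break multi_cell, and wraps each write in a try/except so one bad string doesn't abort the whole PDF. A minimal sketch of that pattern, assuming (as the diff itself does) that DejaVuSansCondensed.ttf sits next to app.py; the sample strings and output filename are placeholders:

from fpdf import FPDF

pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()
# Register a Unicode TTF so multi_cell can render characters outside Latin-1.
# DejaVuSansCondensed.ttf must be present in the working directory.
pdf.add_font('DejaVu', '', 'DejaVuSansCondensed.ttf', uni=True)
pdf.set_font('DejaVu', size=12)

for text in ["Plain ASCII line", "Unicode test: αβγ ünïcødé"]:
    try:
        # Width 0 means "extend to the right margin"; long lines wrap automatically.
        pdf.multi_cell(0, 10, txt=text)
    except Exception as e:
        print(f"Error writing text to PDF: {e}")

pdf.output('example.pdf')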