Update app.py
app.py CHANGED
@@ -22,7 +22,7 @@ def get_page_content(url):
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
         content = []
-        main_content = soup.find('main')
+        main_content = soup.find('article') or soup.find('main') or soup
         if main_content:
             for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
                 for element in main_content.find_all(tag):
@@ -35,37 +35,48 @@ def get_page_content(url):
         logger.error(f"Error processing {url}: {str(e)}")
         return [f"Error processing {url}: {str(e)}"]
 
-def ...
+def get_links(url):
     try:
-        logger.info(f"Fetching links from: {url}")
         response = requests.get(url, timeout=10)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
-        ...
-        if ...
-            links = main_content.find_all('a', href=True)
-            doc_links = [urljoin(url, link['href']) for link in links if link['href'].startswith('/docs')]
-            logger.info(f"Found {len(doc_links)} documentation links")
-            return doc_links
-        logger.warning("No main content found on the page")
-        return []
+        links = soup.find_all('a', href=True)
+        return [urljoin(url, link['href']) for link in links if link['href'].startswith('/docs')]
     except Exception as e:
         logger.error(f"Error getting links from {url}: {str(e)}")
         return []
 
-def website_to_pdf(url):
-    ...
-    ...
-    ...
-    ...
+def crawl_pages(base_url, max_depth):
+    visited = set()
+    to_visit = [(base_url, 0)]
+    all_pages = []
+
+    while to_visit:
+        current_url, depth = to_visit.pop(0)
+        if current_url in visited or depth > max_depth:
+            continue
+
+        visited.add(current_url)
+        content = get_page_content(current_url)
+        all_pages.append((current_url, content))
+
+        if depth < max_depth:
+            links = get_links(current_url)
+            to_visit.extend((link, depth + 1) for link in links if link not in visited)
+
+    return all_pages
+
+def website_to_pdf(url, max_depth):
+    logger.info(f"Starting to process: {url} with max depth: {max_depth}")
+    all_pages = crawl_pages(url, max_depth)
+    logger.info(f"Found {len(all_pages)} pages to process")
 
     pdf = FPDF()
     pdf.set_auto_page_break(auto=True, margin=15)
     pdf.add_page()
     pdf.set_font("Arial", size=12)
 
-    for page_url in ...
-        content = get_page_content(page_url)
+    for page_url, content in all_pages:
         pdf.cell(0, 10, txt=page_url, ln=True)
         pdf.ln(5)
         for text in content:
@@ -82,9 +93,9 @@ def website_to_pdf(url):
 
     return pdf_path
 
-def process_url(url):
+def process_url(url, depth):
     try:
-        pdf_file = website_to_pdf(url)
+        pdf_file = website_to_pdf(url, depth)
         return pdf_file
     except Exception as e:
         logger.error(f"Error in process_url: {str(e)}")
@@ -92,10 +103,13 @@ def process_url(url):
 
 iface = gr.Interface(
     fn=process_url,
-    inputs=...
+    inputs=[
+        gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs)"),
+        gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Crawl Depth")
+    ],
     outputs=gr.File(label="Download PDF"),
     title="Gradio Documentation to PDF Converter",
-    description="Enter the Gradio docs URL to convert ..."
+    description="Enter the Gradio docs URL and crawl depth to convert documentation pages into a PDF."
 )
 
 if __name__ == "__main__":
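The updated get_page_content() line relies on BeautifulSoup's find() returning None when a tag is missing, so the or-chain falls back from <article> to <main> to the whole document. A small, self-contained illustration; the HTML string is made up:

from bs4 import BeautifulSoup

html = "<html><body><main><p>hello</p></main></body></html>"  # made-up sample page
soup = BeautifulSoup(html, "html.parser")

# find() returns None for a missing tag, so the `or` chain falls through to the next option.
main_content = soup.find("article") or soup.find("main") or soup
print(main_content.name)  # "main" here; "[document]" if neither tag were present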
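The core of this change is the new breadth-first crawl with a depth cap. For a quick sanity check of that traversal without any network access, a minimal sketch that mirrors the crawl_pages()/get_links() logic from this commit against an in-memory link graph; the example.test URLs and the crawl()/FAKE_LINKS names are hypothetical and not part of app.py:

# Hypothetical stand-in for get_links(): a tiny in-memory link graph.
FAKE_LINKS = {
    "https://example.test/docs": ["https://example.test/docs/interface", "https://example.test/docs/blocks"],
    "https://example.test/docs/interface": ["https://example.test/docs/components"],
}

def crawl(base_url, max_depth):
    # Same traversal as crawl_pages(): FIFO queue, visited set, depth cap.
    visited = set()
    to_visit = [(base_url, 0)]
    order = []
    while to_visit:
        current_url, depth = to_visit.pop(0)
        if current_url in visited or depth > max_depth:
            continue
        visited.add(current_url)
        order.append((current_url, depth))
        if depth < max_depth:
            links = FAKE_LINKS.get(current_url, [])
            to_visit.extend((link, depth + 1) for link in links if link not in visited)
    return order

print(crawl("https://example.test/docs", max_depth=1))
# [('https://example.test/docs', 0),
#  ('https://example.test/docs/interface', 1),
#  ('https://example.test/docs/blocks', 1)]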
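On the interface side, adding the gr.Slider means Gradio now passes two positional arguments to fn, which is why process_url gained a depth parameter. A minimal standalone sketch of that wiring; the echo function and its output label are illustrative, not the app's real handler:

import gradio as gr

def echo(url, depth):
    # Inputs arrive in the same order as the `inputs` list: Textbox value, then Slider value.
    return f"Would crawl {url} down to depth {int(depth)}"

demo = gr.Interface(
    fn=echo,
    inputs=[
        gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs)"),
        gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Crawl Depth"),
    ],
    outputs=gr.Textbox(label="Result"),
)

if __name__ == "__main__":
    demo.launch()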