# website-to-pdf / app.py
# Author: bluenevus — commit 6ecf729 ("Create app.py")
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pdfkit
import os
def get_subdirectory_pages(url, base_url, visited=None):
    """Recursively crawl *url*, collecting every page under *base_url*.

    Args:
        url: Page to fetch and scan for links.
        base_url: URL prefix a link must start with to be followed.
        visited: Set of already-seen URLs, shared across recursive calls.
            Defaults to a fresh set for each top-level call.

    Returns:
        List of discovered URLs, starting with *url* itself. On a fetch or
        parse error the page is still included but its links are not followed.
    """
    # BUG FIX: the original used `visited=set()`, a mutable default that is
    # shared across *all* top-level calls — a second crawl of the same site
    # returned [] because every URL was already marked visited.
    if visited is None:
        visited = set()
    if url in visited:
        return []
    visited.add(url)
    pages = [url]
    try:
        # Timeout keeps one unresponsive page from hanging the whole crawl.
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                continue
            # Resolve relative links; drop fragments so "page#section" is
            # not crawled as a page distinct from "page".
            full_url = urljoin(base_url, href).split('#', 1)[0]
            if full_url.startswith(base_url) and full_url not in visited:
                pages.extend(get_subdirectory_pages(full_url, base_url, visited))
    except Exception as e:
        # Best-effort crawl: log the failure and keep going rather than
        # aborting the entire conversion on one bad page.
        print(f"Error processing {url}: {e}")
    return pages
def website_to_pdf(url):
    """Crawl the subtree rooted at *url* and render all pages into one PDF.

    Returns the path of the generated PDF. Relies on pdfkit (wkhtmltopdf)
    for rendering; any crawl or rendering exception propagates to the caller.
    """
    # Normalise to a trailing-slash prefix so the crawler's startswith()
    # check treats ".../docs" and ".../docs/" identically.
    parts = urlparse(url)
    base_url = f"{parts.scheme}://{parts.netloc}{parts.path.rstrip('/')}/"

    pages = get_subdirectory_pages(base_url, base_url)

    # A4 pages with uniform 0.75-inch margins on every side.
    pdf_options = {
        f"margin-{side}": '0.75in'
        for side in ('top', 'right', 'bottom', 'left')
    }
    pdf_options['page-size'] = 'A4'

    output_file = "subdirectory_documentation.pdf"
    # pdfkit accepts a list of URLs and concatenates them into one document.
    pdfkit.from_url(pages, output_file, options=pdf_options)
    return output_file
def process_url(url):
    """Gradio handler: convert *url*'s subtree to a PDF and return its path.

    Returns:
        Path to the generated PDF on success, or None on failure — gr.File
        renders None as "no file available".
    """
    try:
        return website_to_pdf(url)
    except Exception as e:
        # BUG FIX: the original returned the error *string*, which the
        # gr.File output component would then treat as a file path and
        # fail to serve. Log the error and return None instead.
        print(f"An error occurred: {e}")
        return None
# Gradio UI wiring: one URL textbox in, a downloadable PDF file out.
# process_url returns a file path (or None on failure) for the gr.File output.
iface = gr.Interface(
    fn=process_url,
    inputs=gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs)"),
    outputs=gr.File(label="Download PDF"),
    title="Website Subdirectory to PDF Converter",
    description="Enter a website URL to convert its subdirectories into a PDF."
)
# Launch the web app only when executed as a script (not when imported).
if __name__ == "__main__":
    iface.launch()