# website-to-pdf / app.py
# Last commit: "Update app.py" by bluenevus (de0ffde, verified)
# File size: 2.6 kB
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from io import BytesIO
import tempfile
def get_subdirectory_pages(url, base_url, visited=None):
    """Recursively collect the pages under ``base_url`` reachable from ``url``.

    Args:
        url: Page to fetch and scan for ``<a href>`` links.
        base_url: URL prefix; a link is followed only when its absolute form
            starts with this prefix.
        visited: Set of URLs that have already been seen. It is mutated in
            place so recursive calls share it. When omitted, a fresh set is
            created for this crawl.

    Returns:
        A list of URLs in depth-first discovery order, beginning with ``url``.
        An empty list if ``url`` was already in ``visited``.
    """
    # A ``set()`` default would be created once and shared by every call, so a
    # second crawl of the same site would see everything as already visited.
    if visited is None:
        visited = set()
    if url in visited:
        return []
    visited.add(url)
    pages = [url]
    try:
        # Bound the wait so one unresponsive server cannot hang the crawl.
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Relative links are relative to the page they appear on (after any
        # redirect), not to the crawl root.
        page_url = response.url or url
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                continue
            # Drop the fragment: "page#a" and "page#b" are the same document.
            full_url = urljoin(page_url, href).split('#', 1)[0]
            if full_url.startswith(base_url) and full_url not in visited:
                pages.extend(get_subdirectory_pages(full_url, base_url, visited))
    except Exception as e:
        # Best effort: a page that cannot be fetched or parsed is still listed,
        # but its links are not followed.
        print(f"Error processing {url}: {e}")
    return pages
def website_to_pdf(url):
    """Crawl the pages under ``url`` and write their text into one PDF file.

    The URL is normalised to ``scheme://netloc/path/`` (query and fragment are
    dropped) and used both as the crawl start and as the prefix that bounds
    the crawl. Each crawled page starts on a fresh PDF page headed by its URL,
    followed by the page's plain text, one source line per PDF line, truncated
    to 80 characters.

    Args:
        url: Root URL of the site section to convert.

    Returns:
        Path of a temporary ``.pdf`` file. The file is not deleted
        automatically.
    """
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip('/')}/"
    all_pages = get_subdirectory_pages(base_url, base_url)

    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=letter)
    width, height = letter

    margin = 30        # left/top/bottom margin in points
    line_height = 12   # vertical advance per text line in points

    for page_url in all_pages:
        c.setFont("Helvetica", 12)
        c.drawString(margin, height - margin, page_url)
        c.setFont("Helvetica", 10)
        try:
            # Bound the wait so one unresponsive server cannot hang the export.
            response = requests.get(page_url, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            text = soup.get_text()
            y = height - 50
            for line in text.split('\n'):
                if y < margin:
                    c.showPage()
                    # showPage() discards the font selection, so restore the
                    # body font before drawing on the continuation page.
                    c.setFont("Helvetica", 10)
                    y = height - margin
                c.drawString(margin, y, line[:80])
                y -= line_height
        except Exception as e:
            print(f"Error processing {page_url}: {e}")
        finally:
            # Always close the page, even after a failure, so the next URL
            # header is not drawn on top of this one.
            c.showPage()
    c.save()

    # delete=False keeps the file on disk after the handle is closed so the
    # caller can serve it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
        tmp.write(buffer.getvalue())
        output_file = tmp.name
    return output_file
def process_url(url):
    """Gradio callback: convert the site section at ``url`` into a PDF.

    Args:
        url: URL typed into the interface's textbox.

    Returns:
        Path of the generated PDF file, for the ``gr.File`` output.

    Raises:
        gr.Error: If the conversion fails. The output component is a file, so
            returning the error text would make Gradio treat the message as a
            file path; raising shows the message in the UI instead.
    """
    try:
        return website_to_pdf(url)
    except Exception as e:
        raise gr.Error(f"An error occurred: {str(e)}") from e
# Gradio front end: one URL textbox in, one downloadable PDF file out.
iface = gr.Interface(
    process_url,
    gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs)"),
    gr.File(label="Download PDF"),
    title="Website Subdirectory to PDF Converter",
    description="Enter a website URL to convert its subdirectories into a PDF.",
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()