# Spaces: Sleeping  (page-header residue from the hosting site; not part of the program)
import gradio as gr | |
import requests | |
from bs4 import BeautifulSoup | |
from urllib.parse import urljoin, urlparse | |
from reportlab.pdfgen import canvas | |
from reportlab.lib.pagesizes import letter | |
from io import BytesIO | |
import tempfile | |
def get_subdirectory_pages(url, base_url, visited=None):
    """Recursively collect the URLs reachable from *url* that live under *base_url*.

    Args:
        url: Page to fetch and scan for ``<a href>`` links.
        base_url: URL prefix; only links that start with it are followed.
        visited: Set of URLs already seen. It is mutated in place so that
            recursive calls share state. Defaults to a fresh set per
            top-level call.

    Returns:
        A list of URLs in depth-first discovery order, starting with *url*.
        Returns ``[]`` when *url* is already in *visited*.
    """
    # A mutable default (``visited=set()``) would be shared by every call and
    # make any second crawl of the same site return nothing.
    if visited is None:
        visited = set()
    if url in visited:
        return []
    visited.add(url)
    pages = [url]
    try:
        # Without a timeout a single unresponsive host blocks the crawl forever.
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                continue
            # Relative links are relative to the page they appear on, not to
            # the crawl root. Drop the fragment so "page#a" and "page#b" are
            # not fetched as two different documents.
            full_url = urljoin(url, href).partition('#')[0]
            if full_url.startswith(base_url) and full_url not in visited:
                pages.extend(get_subdirectory_pages(full_url, base_url, visited))
    except Exception as e:
        # Best-effort crawl: a page that cannot be fetched or parsed is still
        # reported, but its links are not followed.
        print(f"Error processing {url}: {e}")
    return pages
def website_to_pdf(url):
    """Crawl the subdirectory rooted at *url* and write its text into one PDF.

    Each discovered page starts on a new PDF page headed by its URL, followed
    by the page's visible text, one source line per PDF line (truncated to 80
    characters).

    Args:
        url: An ``http``/``https`` URL whose path is used as the crawl root.

    Returns:
        Path of a temporary ``.pdf`` file. The file is created with
        ``delete=False``, so it is not removed automatically.

    Raises:
        ValueError: If *url* has no http(s) scheme or no host.
    """
    parsed_url = urlparse((url or "").strip())
    if parsed_url.scheme not in ("http", "https") or not parsed_url.netloc:
        raise ValueError(f"Not a valid http(s) URL: {url!r}")

    # Normalise to exactly one trailing slash so the prefix test in the
    # crawler matches whole path segments.
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip('/')}/"
    all_pages = get_subdirectory_pages(base_url, base_url)

    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=letter)
    _, height = letter
    margin = 30
    line_height = 12

    for page_url in all_pages:
        c.setFont("Helvetica", 12)
        c.drawString(margin, height - margin, page_url)
        c.setFont("Helvetica", 10)
        try:
            response = requests.get(page_url, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            y = height - 50
            for line in soup.get_text().split('\n'):
                if y < margin:
                    c.showPage()
                    # showPage() discards the graphics state, including the
                    # font, so the body font must be selected again.
                    c.setFont("Helvetica", 10)
                    y = height - margin
                c.drawString(margin, y, line[:80])
                y -= line_height
        except Exception as e:
            print(f"Error processing {page_url}: {e}")
        finally:
            # Always close the page, even on failure; otherwise the next URL
            # header would be drawn on top of this one.
            c.showPage()
    c.save()

    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
        tmp.write(buffer.getvalue())
        return tmp.name
def process_url(url):
    """Gradio callback: convert *url* to a PDF and return the file path.

    Args:
        url: Text entered by the user in the URL box.

    Returns:
        Path of the generated PDF, for the ``gr.File`` output component.

    Raises:
        gr.Error: If the conversion fails for any reason. The output is a
            file component, so returning the message as a string would make
            Gradio treat it as a file path; raising ``gr.Error`` shows the
            message to the user instead.
    """
    try:
        return website_to_pdf(url)
    except Exception as e:
        raise gr.Error(f"An error occurred: {str(e)}") from e
# Single-function UI: one text box for the URL in, one downloadable file out.
iface = gr.Interface(
    title="Website Subdirectory to PDF Converter",
    description="Enter a website URL to convert its subdirectories into a PDF.",
    fn=process_url,
    inputs=gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs)"),
    outputs=gr.File(label="Download PDF"),
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()