import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from fpdf import FPDF
import tempfile
import re
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def clean_text(text):
    # Keep only printable characters, then replace each run of non-ASCII
    # characters with a single space so FPDF's latin-1 fonts can render it.
    text = ''.join(char for char in text if char.isprintable())
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return text
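# Illustrative behavior (a quick sanity check, not part of the original file):
# clean_text("naïve café") returns "na ve caf " because every run of
# non-ASCII characters collapses into a single space.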
def get_page_content(url):
    try:
        logger.info(f"Fetching content from: {url}")
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        content = []
        # Scrape only the <main> element to skip navigation, header and footer.
        main_content = soup.find('main')
        if main_content:
            for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
                for element in main_content.find_all(tag):
                    text = clean_text(element.get_text(strip=True))
                    if text:
                        content.append(text)
        logger.info(f"Found {len(content)} content items for {url}")
        return content
    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return [f"Error processing {url}: {str(e)}"]
def get_all_doc_links(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('main')
        if main_content:
            links = main_content.find_all('a', href=True)
            # Resolve relative hrefs against the base URL and keep only
            # documentation links; dict.fromkeys() deduplicates while
            # preserving order, so repeated nav links don't repeat in the PDF.
            doc_links = [urljoin(url, link['href']) for link in links
                         if link['href'].startswith('/docs')]
            return list(dict.fromkeys(doc_links))
        return []
    except Exception as e:
        logger.error(f"Error getting links from {url}: {str(e)}")
        return []
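# Illustrative usage (results depend on the live site): for the docs root,
# get_all_doc_links("https://www.gradio.app/docs") yields absolute URLs such
# as "https://www.gradio.app/docs/interface" for every in-page link whose
# href starts with "/docs".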
def website_to_pdf(url):
    # Use the URL the user entered; fall back to the Gradio docs root when the
    # field is empty. (The original version always crawled the hardcoded
    # https://www.gradio.app/docs and ignored its argument.)
    base_url = url or "https://www.gradio.app/docs"
    logger.info(f"Starting to process: {base_url}")
    all_links = get_all_doc_links(base_url)
    logger.info(f"Found {len(all_links)} pages to process")
    if not all_links:
        # No sub-pages found: convert the base page itself.
        all_links = [base_url]
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)
    for page_url in all_links:
        # One PDF section per documentation page, headed by its URL. Starting
        # each iteration with add_page() avoids the trailing blank page the
        # original loop produced.
        pdf.add_page()
        content = get_page_content(page_url)
        pdf.cell(0, 10, txt=page_url, ln=True)
        pdf.ln(5)
        for text in content:
            pdf.multi_cell(0, 10, txt=text)
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
        pdf_path = tmp.name
    pdf.output(pdf_path)
    logger.info(f"PDF saved to: {pdf_path}")
    return pdf_path
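# Minimal sketch of running the conversion outside the Gradio UI, assuming
# network access to the docs site:
#
#     path = website_to_pdf("https://www.gradio.app/docs")
#     print(f"PDF written to {path}")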
def process_url(url):
    try:
        return website_to_pdf(url)
    except Exception as e:
        logger.error(f"Error in process_url: {str(e)}")
        # Surface the failure in the UI; returning a plain string would not
        # satisfy the gr.File output component.
        raise gr.Error(f"An error occurred: {str(e)}")
iface = gr.Interface(
    fn=process_url,
    inputs=gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs)"),
    outputs=gr.File(label="Download PDF"),
    title="Gradio Documentation to PDF Converter",
    description="Enter the Gradio docs URL to convert all documentation pages into a PDF."
)
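# Optional tweak (not in the original app): the full crawl can take a while,
# so enabling Gradio's request queue is one way to avoid HTTP timeouts on
# slow conversions:
#
#     iface.queue()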
if __name__ == "__main__":
    iface.launch()