File size: 2,598 Bytes
6ecf729
 
 
 
de0ffde
 
 
56c5685
6ecf729
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
de0ffde
 
 
 
56c5685
de0ffde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ecf729
56c5685
de0ffde
56c5685
6ecf729
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from io import BytesIO
import tempfile

def get_subdirectory_pages(url, base_url, visited=None):
    """Recursively collect *url* and every linked page located under *base_url*.

    Pages are returned in depth-first discovery order, starting with *url*
    itself. A page whose download or parsing fails is still listed, but its
    links are not followed.

    Args:
        url: Page to fetch and scan for ``<a href>`` links.
        base_url: Prefix a link must start with to be followed.
        visited: Set of URLs already seen. It is mutated in place so that
            recursive calls share it; leave it as ``None`` to start a fresh
            crawl.

    Returns:
        A list of page URLs (empty when *url* is already in *visited*).
    """
    # Create the set per call: a ``set()`` default would be shared by every
    # call and make each crawl after the first skip already-seen URLs.
    if visited is None:
        visited = set()

    if url in visited:
        return []

    visited.add(url)
    pages = [url]

    try:
        # Bound the wait so one unresponsive server cannot hang the crawl.
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                continue
            # Relative links are relative to the page they appear on (after
            # redirects), not to the crawl root. The fragment is dropped
            # because "page#a" and "page#b" are the same document.
            full_url = urljoin(response.url, href).partition('#')[0]
            if full_url.startswith(base_url) and full_url not in visited:
                pages.extend(get_subdirectory_pages(full_url, base_url, visited))
    except Exception as e:
        # Deliberately broad: the crawl is best-effort, so a network or
        # parsing failure on one page must not abort the whole crawl.
        print(f"Error processing {url}: {e}")

    return pages

def website_to_pdf(url):
    """Crawl the subdirectory rooted at *url* and write its text into a PDF.

    Every page found by ``get_subdirectory_pages`` starts on a new PDF page
    headed by its URL, followed by the page's visible text wrapped at 80
    characters per line.

    Args:
        url: Absolute ``http``/``https`` URL of the subdirectory to convert.

    Returns:
        Path of a temporary ``.pdf`` file. The file is created with
        ``delete=False``, so removing it is left to the caller.

    Raises:
        ValueError: If *url* is not an absolute http(s) URL.
    """
    import textwrap

    parsed_url = urlparse((url or "").strip())
    # Without a scheme and host the base URL would degenerate to ":///" and
    # the result would be a PDF containing nothing but that header.
    if parsed_url.scheme not in ("http", "https") or not parsed_url.netloc:
        raise ValueError(
            f"Expected an absolute http(s) URL such as https://example.com/docs, got {url!r}"
        )

    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip('/')}/"
    all_pages = get_subdirectory_pages(base_url, base_url)

    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=letter)
    _, height = letter

    for page_url in all_pages:
        c.setFont("Helvetica", 12)
        c.drawString(30, height - 30, page_url)
        c.setFont("Helvetica", 10)
        try:
            # Bound the wait so one unresponsive server cannot hang the export.
            response = requests.get(page_url, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            y = height - 50
            for line in soup.get_text().split('\n'):
                # Wrap instead of truncating so long lines keep their text;
                # ``or [""]`` keeps blank lines as vertical spacing.
                for segment in textwrap.wrap(line, width=80) or [""]:
                    if y < 30:
                        c.showPage()
                        # showPage() resets the graphics state, font included.
                        c.setFont("Helvetica", 10)
                        y = height - 30
                    c.drawString(30, y, segment)
                    y -= 12
        except Exception as e:
            # Best-effort: one failing page must not abort the whole document.
            print(f"Error processing {page_url}: {e}")
        # Always close the page, even after a failure, so the next page's
        # header is not drawn on top of this one.
        c.showPage()

    c.save()

    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
        tmp.write(buffer.getvalue())
        output_file = tmp.name

    return output_file

def process_url(url):
    """Gradio callback: convert the site at *url* to a PDF and return its path.

    Args:
        url: Website URL entered by the user.

    Returns:
        Path of the generated PDF file, as produced by ``website_to_pdf``.

    Raises:
        gr.Error: If the conversion fails for any reason; Gradio displays the
            message to the user.
    """
    try:
        return website_to_pdf(url)
    except Exception as e:
        # The output component is a gr.File, so a returned error string would
        # be interpreted as a file path. Raising gr.Error surfaces the message
        # in the UI instead.
        raise gr.Error(f"An error occurred: {str(e)}") from e

# Gradio front end: a single text box takes the URL, and the callback's
# result is offered as a downloadable file.
iface = gr.Interface(
    title="Website Subdirectory to PDF Converter",
    description="Enter a website URL to convert its subdirectories into a PDF.",
    fn=process_url,
    inputs=gr.Textbox(
        label="Enter website URL (e.g., https://www.gradio.app/docs)",
    ),
    outputs=gr.File(
        label="Download PDF",
    ),
)

# Launch the app only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()