bluenevus committed on
Commit
6ecf729
·
verified ·
1 Parent(s): 410b5fe

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -0
app.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ from urllib.parse import urljoin, urlparse
5
+ import pdfkit
6
+ import os
7
+
8
def get_subdirectory_pages(url, base_url, visited=None):
    """Collect every page reachable from *url* that lives under *base_url*.

    Performs a depth-first, pre-order walk over the ``<a href>`` links of
    each fetched page, following only links whose absolute URL starts with
    *base_url*. A page that cannot be fetched or parsed is still listed,
    but contributes no further links; the error is printed and the crawl
    continues.

    Args:
        url: Page to start crawling from.
        base_url: URL prefix a link must start with to be followed.
        visited: Optional set of URLs that have already been crawled. It is
            updated in place. When omitted, a fresh set is created for this
            call so separate crawls never share state.

    Returns:
        List of page URLs in the order they were first discovered. The list
        is empty when *url* is already present in *visited*.
    """
    # A ``set()`` default would be created once and shared by every call,
    # making any later crawl of the same site return nothing.
    if visited is None:
        visited = set()

    pages = []
    # Explicit stack instead of recursion: a long chain of pages must not
    # be cut short by the interpreter's recursion limit.
    pending = [url]

    while pending:
        current = pending.pop()
        if current in visited:
            continue

        visited.add(current)
        pages.append(current)

        links = []
        try:
            # Bounded wait so one unresponsive server cannot hang the crawl.
            response = requests.get(current, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')

            for anchor in soup.find_all('a'):
                href = anchor.get('href')
                if not href:
                    continue
                # Relative links are relative to the page they appear on,
                # not to the crawl root. The fragment is dropped because
                # "page#a" and "page#b" are the same document.
                target = urljoin(current, href).partition('#')[0]
                if target.startswith(base_url) and target not in visited:
                    links.append(target)
        except Exception as e:
            # Best-effort crawl: report the failure and keep going.
            print(f"Error processing {current}: {e}")

        # Reversed so the first link on the page is popped (visited) first,
        # which keeps the pre-order of a recursive walk.
        pending.extend(reversed(links))

    return pages
29
+
30
def website_to_pdf(url):
    """Render every page found under *url* into a single PDF file.

    The URL's scheme, host and path (without a trailing slash, query or
    fragment) form the base prefix; all pages discovered under that prefix
    by ``get_subdirectory_pages`` are passed to ``pdfkit.from_url``.

    Args:
        url: Absolute ``http``/``https`` URL of the subdirectory to convert.
            Surrounding whitespace is ignored.

    Returns:
        Path of the generated PDF file.

    Raises:
        ValueError: If *url* is not an absolute http(s) URL.
    """
    parsed_url = urlparse((url or "").strip())
    # Reject anything that is not a web address before doing any work.
    # Besides catching typos, this keeps schemes such as file:// away from
    # the PDF renderer, which is handed these URLs verbatim.
    if parsed_url.scheme not in ("http", "https") or not parsed_url.netloc:
        raise ValueError(f"Expected an absolute http(s) URL, got: {url!r}")

    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip('/')}/"
    # A fresh visited-set per conversion so state from an earlier request
    # can never cause this crawl to come back empty.
    all_pages = get_subdirectory_pages(base_url, base_url, set())

    options = {
        'page-size': 'A4',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
    }

    output_file = "subdirectory_documentation.pdf"
    pdfkit.from_url(all_pages, output_file, options=options)

    return output_file
47
+
48
def process_url(url):
    """Gradio callback: convert the site at *url* to a PDF.

    Args:
        url: Website URL entered by the user.

    Returns:
        Path of the generated PDF file.

    Raises:
        gr.Error: If the conversion fails for any reason; the message
            carries the text of the underlying exception.
    """
    try:
        return website_to_pdf(url)
    except Exception as e:
        # The interface's output is a gr.File, so returning the message as
        # a plain string would be treated as a file path. Raising gr.Error
        # surfaces the message to the user instead.
        raise gr.Error(f"An error occurred: {str(e)}") from e
54
+
55
# Web UI: one text box for the URL in, one downloadable file out.
# The callback, input and output are passed positionally (fn, inputs, outputs).
iface = gr.Interface(
    process_url,
    gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs)"),
    gr.File(label="Download PDF"),
    title="Website Subdirectory to PDF Converter",
    description="Enter a website URL to convert its subdirectories into a PDF.",
)

# Start the server only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()