bluenevus committed on
Commit
56c5685
·
verified ·
1 Parent(s): 78e2d60

Update app.py (replace pdfkit with WeasyPrint for PDF generation)

Browse files
Files changed (1) hide show
  1. app.py +11 -10
app.py CHANGED
@@ -2,7 +2,8 @@ import gradio as gr
2
  import requests
3
  from bs4 import BeautifulSoup
4
  from urllib.parse import urljoin, urlparse
5
- import pdfkit
 
6
  import os
7
 
8
  def get_subdirectory_pages(url, base_url, visited=set()):
@@ -32,16 +33,16 @@ def website_to_pdf(url):
32
  base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip('/')}/"
33
  all_pages = get_subdirectory_pages(base_url, base_url)
34
 
35
- options = {
36
- 'page-size': 'A4',
37
- 'margin-top': '0.75in',
38
- 'margin-right': '0.75in',
39
- 'margin-bottom': '0.75in',
40
- 'margin-left': '0.75in',
41
- }
42
 
43
- output_file = "subdirectory_documentation.pdf"
44
- pdfkit.from_url(all_pages, output_file, options=options)
 
45
 
46
  return output_file
47
 
 
2
  import requests
3
  from bs4 import BeautifulSoup
4
  from urllib.parse import urljoin, urlparse
5
+ from weasyprint import HTML
6
+ import tempfile
7
  import os
8
 
9
  def get_subdirectory_pages(url, base_url, visited=set()):
 
33
  base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip('/')}/"
34
  all_pages = get_subdirectory_pages(base_url, base_url)
35
 
36
+ combined_html = "<html><body>"
37
+ for page_url in all_pages:
38
+ response = requests.get(page_url)
39
+ combined_html += f"<h1>{page_url}</h1>"
40
+ combined_html += response.text
41
+ combined_html += "</body></html>"
 
42
 
43
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
44
+ HTML(string=combined_html).write_pdf(tmp.name)
45
+ output_file = tmp.name
46
 
47
  return output_file
48