Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,7 +2,8 @@ import gradio as gr
|
|
2 |
import requests
|
3 |
from bs4 import BeautifulSoup
|
4 |
from urllib.parse import urljoin, urlparse
|
5 |
-
import
|
|
|
6 |
import os
|
7 |
|
8 |
def get_subdirectory_pages(url, base_url, visited=set()):
|
@@ -32,16 +33,16 @@ def website_to_pdf(url):
|
|
32 |
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip('/')}/"
|
33 |
all_pages = get_subdirectory_pages(base_url, base_url)
|
34 |
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
}
|
42 |
|
43 |
-
|
44 |
-
|
|
|
45 |
|
46 |
return output_file
|
47 |
|
|
|
2 |
import requests
|
3 |
from bs4 import BeautifulSoup
|
4 |
from urllib.parse import urljoin, urlparse
|
5 |
+
from weasyprint import HTML
|
6 |
+
import tempfile
|
7 |
import os
|
8 |
|
9 |
def get_subdirectory_pages(url, base_url, visited=set()):
|
|
|
33 |
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip('/')}/"
|
34 |
all_pages = get_subdirectory_pages(base_url, base_url)
|
35 |
|
36 |
+
combined_html = "<html><body>"
|
37 |
+
for page_url in all_pages:
|
38 |
+
response = requests.get(page_url)
|
39 |
+
combined_html += f"<h1>{page_url}</h1>"
|
40 |
+
combined_html += response.text
|
41 |
+
combined_html += "</body></html>"
|
|
|
42 |
|
43 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
|
44 |
+
HTML(string=combined_html).write_pdf(tmp.name)
|
45 |
+
output_file = tmp.name
|
46 |
|
47 |
return output_file
|
48 |
|