Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,9 +2,10 @@ import gradio as gr
|
|
2 |
import requests
|
3 |
from bs4 import BeautifulSoup
|
4 |
from urllib.parse import urljoin, urlparse
|
5 |
-
from
|
|
|
|
|
6 |
import tempfile
|
7 |
-
import os
|
8 |
|
9 |
def get_subdirectory_pages(url, base_url, visited=set()):
|
10 |
if url in visited:
|
@@ -33,15 +34,34 @@ def website_to_pdf(url):
|
|
33 |
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip('/')}/"
|
34 |
all_pages = get_subdirectory_pages(base_url, base_url)
|
35 |
|
36 |
-
|
|
|
|
|
|
|
37 |
for page_url in all_pages:
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
|
44 |
-
|
45 |
output_file = tmp.name
|
46 |
|
47 |
return output_file
|
|
|
2 |
import requests
|
3 |
from bs4 import BeautifulSoup
|
4 |
from urllib.parse import urljoin, urlparse
|
5 |
+
from reportlab.pdfgen import canvas
|
6 |
+
from reportlab.lib.pagesizes import letter
|
7 |
+
from io import BytesIO
|
8 |
import tempfile
|
|
|
9 |
|
10 |
def get_subdirectory_pages(url, base_url, visited=set()):
|
11 |
if url in visited:
|
|
|
34 |
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip('/')}/"
|
35 |
all_pages = get_subdirectory_pages(base_url, base_url)
|
36 |
|
37 |
+
buffer = BytesIO()
|
38 |
+
c = canvas.Canvas(buffer, pagesize=letter)
|
39 |
+
width, height = letter
|
40 |
+
|
41 |
for page_url in all_pages:
|
42 |
+
c.setFont("Helvetica", 12)
|
43 |
+
c.drawString(30, height - 30, page_url)
|
44 |
+
c.setFont("Helvetica", 10)
|
45 |
+
try:
|
46 |
+
response = requests.get(page_url)
|
47 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
48 |
+
text = soup.get_text()
|
49 |
+
lines = text.split('\n')
|
50 |
+
y = height - 50
|
51 |
+
for line in lines:
|
52 |
+
if y < 30:
|
53 |
+
c.showPage()
|
54 |
+
y = height - 30
|
55 |
+
c.drawString(30, y, line[:80])
|
56 |
+
y -= 12
|
57 |
+
c.showPage()
|
58 |
+
except Exception as e:
|
59 |
+
print(f"Error processing {page_url}: {e}")
|
60 |
+
|
61 |
+
c.save()
|
62 |
|
63 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
|
64 |
+
tmp.write(buffer.getvalue())
|
65 |
output_file = tmp.name
|
66 |
|
67 |
return output_file
|