Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,6 +4,8 @@ from bs4 import BeautifulSoup
|
|
4 |
from urllib.parse import urljoin, urlparse
|
5 |
from reportlab.pdfgen import canvas
|
6 |
from reportlab.lib.pagesizes import letter
|
|
|
|
|
7 |
from io import BytesIO
|
8 |
import tempfile
|
9 |
|
@@ -35,30 +37,26 @@ def website_to_pdf(url):
|
|
35 |
all_pages = get_subdirectory_pages(base_url, base_url)
|
36 |
|
37 |
buffer = BytesIO()
|
38 |
-
|
39 |
-
|
|
|
40 |
|
41 |
for page_url in all_pages:
|
42 |
-
|
43 |
-
c.drawString(30, height - 30, page_url)
|
44 |
-
c.setFont("Helvetica", 10)
|
45 |
try:
|
46 |
response = requests.get(page_url)
|
47 |
soup = BeautifulSoup(response.text, 'html.parser')
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
c.drawString(30, y, line[:80])
|
56 |
-
y -= 12
|
57 |
-
c.showPage()
|
58 |
except Exception as e:
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
|
63 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
|
64 |
tmp.write(buffer.getvalue())
|
|
|
4 |
from urllib.parse import urljoin, urlparse
|
5 |
from reportlab.pdfgen import canvas
|
6 |
from reportlab.lib.pagesizes import letter
|
7 |
+
from reportlab.lib.styles import getSampleStyleSheet
|
8 |
+
from reportlab.platypus import SimpleDocTemplate, Paragraph
|
9 |
from io import BytesIO
|
10 |
import tempfile
|
11 |
|
|
|
37 |
all_pages = get_subdirectory_pages(base_url, base_url)
|
38 |
|
39 |
buffer = BytesIO()
|
40 |
+
doc = SimpleDocTemplate(buffer, pagesize=letter)
|
41 |
+
styles = getSampleStyleSheet()
|
42 |
+
story = []
|
43 |
|
44 |
for page_url in all_pages:
|
45 |
+
story.append(Paragraph(f"<b>{page_url}</b>", styles['Heading1']))
|
|
|
|
|
46 |
try:
|
47 |
response = requests.get(page_url)
|
48 |
soup = BeautifulSoup(response.text, 'html.parser')
|
49 |
+
|
50 |
+
# Extract text from specific tags
|
51 |
+
for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
|
52 |
+
for element in soup.find_all(tag):
|
53 |
+
text = element.get_text(strip=True)
|
54 |
+
if text:
|
55 |
+
story.append(Paragraph(text, styles['BodyText']))
|
|
|
|
|
|
|
56 |
except Exception as e:
|
57 |
+
story.append(Paragraph(f"Error processing {page_url}: {str(e)}", styles['BodyText']))
|
58 |
+
|
59 |
+
doc.build(story)
|
60 |
|
61 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
|
62 |
tmp.write(buffer.getvalue())
|