bluenevus committed on
Commit
0f462a3
·
verified ·
1 Parent(s): 9349dce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -18
app.py CHANGED
@@ -4,6 +4,8 @@ from bs4 import BeautifulSoup
4
  from urllib.parse import urljoin, urlparse
5
  from reportlab.pdfgen import canvas
6
  from reportlab.lib.pagesizes import letter
 
 
7
  from io import BytesIO
8
  import tempfile
9
 
@@ -35,30 +37,26 @@ def website_to_pdf(url):
35
  all_pages = get_subdirectory_pages(base_url, base_url)
36
 
37
  buffer = BytesIO()
38
- c = canvas.Canvas(buffer, pagesize=letter)
39
- width, height = letter
 
40
 
41
  for page_url in all_pages:
42
- c.setFont("Helvetica", 12)
43
- c.drawString(30, height - 30, page_url)
44
- c.setFont("Helvetica", 10)
45
  try:
46
  response = requests.get(page_url)
47
  soup = BeautifulSoup(response.text, 'html.parser')
48
- text = soup.get_text()
49
- lines = text.split('\n')
50
- y = height - 50
51
- for line in lines:
52
- if y < 30:
53
- c.showPage()
54
- y = height - 30
55
- c.drawString(30, y, line[:80])
56
- y -= 12
57
- c.showPage()
58
  except Exception as e:
59
- print(f"Error processing {page_url}: {e}")
60
-
61
- c.save()
62
 
63
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
64
  tmp.write(buffer.getvalue())
 
4
  from urllib.parse import urljoin, urlparse
5
  from reportlab.pdfgen import canvas
6
  from reportlab.lib.pagesizes import letter
7
+ from reportlab.lib.styles import getSampleStyleSheet
8
+ from reportlab.platypus import SimpleDocTemplate, Paragraph
9
  from io import BytesIO
10
  import tempfile
11
 
 
37
  all_pages = get_subdirectory_pages(base_url, base_url)
38
 
39
  buffer = BytesIO()
40
+ doc = SimpleDocTemplate(buffer, pagesize=letter)
41
+ styles = getSampleStyleSheet()
42
+ story = []
43
 
44
  for page_url in all_pages:
45
+ story.append(Paragraph(f"<b>{page_url}</b>", styles['Heading1']))
 
 
46
  try:
47
  response = requests.get(page_url)
48
  soup = BeautifulSoup(response.text, 'html.parser')
49
+
50
+ # Extract text from specific tags
51
+ for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
52
+ for element in soup.find_all(tag):
53
+ text = element.get_text(strip=True)
54
+ if text:
55
+ story.append(Paragraph(text, styles['BodyText']))
 
 
 
56
  except Exception as e:
57
+ story.append(Paragraph(f"Error processing {page_url}: {str(e)}", styles['BodyText']))
58
+
59
+ doc.build(story)
60
 
61
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
62
  tmp.write(buffer.getvalue())