bluenevus committed on
Commit
de0ffde
·
verified ·
1 Parent(s): abae1a9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -8
app.py CHANGED
@@ -2,9 +2,10 @@ import gradio as gr
2
  import requests
3
  from bs4 import BeautifulSoup
4
  from urllib.parse import urljoin, urlparse
5
- from weasyprint import HTML
 
 
6
  import tempfile
7
- import os
8
 
9
  def get_subdirectory_pages(url, base_url, visited=set()):
10
  if url in visited:
@@ -33,15 +34,34 @@ def website_to_pdf(url):
33
  base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip('/')}/"
34
  all_pages = get_subdirectory_pages(base_url, base_url)
35
 
36
- combined_html = "<html><body>"
 
 
 
37
  for page_url in all_pages:
38
- response = requests.get(page_url)
39
- combined_html += f"<h1>{page_url}</h1>"
40
- combined_html += response.text
41
- combined_html += "</body></html>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
44
- HTML(string=combined_html).write_pdf(tmp.name)
45
  output_file = tmp.name
46
 
47
  return output_file
 
2
  import requests
3
  from bs4 import BeautifulSoup
4
  from urllib.parse import urljoin, urlparse
5
+ from reportlab.pdfgen import canvas
6
+ from reportlab.lib.pagesizes import letter
7
+ from io import BytesIO
8
  import tempfile
 
9
 
10
  def get_subdirectory_pages(url, base_url, visited=set()):
11
  if url in visited:
 
34
  base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip('/')}/"
35
  all_pages = get_subdirectory_pages(base_url, base_url)
36
 
37
+ buffer = BytesIO()
38
+ c = canvas.Canvas(buffer, pagesize=letter)
39
+ width, height = letter
40
+
41
  for page_url in all_pages:
42
+ c.setFont("Helvetica", 12)
43
+ c.drawString(30, height - 30, page_url)
44
+ c.setFont("Helvetica", 10)
45
+ try:
46
+ response = requests.get(page_url)
47
+ soup = BeautifulSoup(response.text, 'html.parser')
48
+ text = soup.get_text()
49
+ lines = text.split('\n')
50
+ y = height - 50
51
+ for line in lines:
52
+ if y < 30:
53
+ c.showPage()
54
+ y = height - 30
55
+ c.drawString(30, y, line[:80])
56
+ y -= 12
57
+ c.showPage()
58
+ except Exception as e:
59
+ print(f"Error processing {page_url}: {e}")
60
+
61
+ c.save()
62
 
63
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
64
+ tmp.write(buffer.getvalue())
65
  output_file = tmp.name
66
 
67
  return output_file