bluenevus committed
Commit 12928b4 · verified · 1 Parent(s): 0f462a3

Update app.py

Files changed (1): app.py +19 -15
app.py CHANGED
@@ -2,19 +2,32 @@ import gradio as gr
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin, urlparse
-from reportlab.pdfgen import canvas
 from reportlab.lib.pagesizes import letter
 from reportlab.lib.styles import getSampleStyleSheet
 from reportlab.platypus import SimpleDocTemplate, Paragraph
 from io import BytesIO
 import tempfile

+def get_page_content(url):
+    try:
+        response = requests.get(url)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        content = []
+        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
+            for element in soup.find_all(tag):
+                text = element.get_text(strip=True)
+                if text:
+                    content.append(text)
+        return content
+    except Exception as e:
+        return [f"Error processing {url}: {str(e)}"]
+
 def get_subdirectory_pages(url, base_url, visited=set()):
     if url in visited:
         return []

     visited.add(url)
-    pages = [url]
+    pages = [(url, get_page_content(url))]

     try:
         response = requests.get(url)
@@ -41,20 +54,11 @@ def website_to_pdf(url):
     styles = getSampleStyleSheet()
     story = []

-    for page_url in all_pages:
+    for page_url, content in all_pages:
         story.append(Paragraph(f"<b>{page_url}</b>", styles['Heading1']))
-        try:
-            response = requests.get(page_url)
-            soup = BeautifulSoup(response.text, 'html.parser')
-
-            # Extract text from specific tags
-            for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
-                for element in soup.find_all(tag):
-                    text = element.get_text(strip=True)
-                    if text:
-                        story.append(Paragraph(text, styles['BodyText']))
-        except Exception as e:
-            story.append(Paragraph(f"Error processing {page_url}: {str(e)}", styles['BodyText']))
+        for text in content:
+            story.append(Paragraph(text, styles['BodyText']))
+        story.append(Paragraph("<br/><br/>", styles['BodyText']))

     doc.build(story)
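After this refactor, each page's text is extracted once, during the crawl, and carried alongside its URL as a (url, content) tuple, instead of being re-fetched while the PDF is assembled. A minimal sketch of the resulting data flow follows, written as if it sat alongside the helpers in app.py; the example URL and the print-based driver are illustrative assumptions, not part of the commit:

# Hypothetical driver (not in app.py) showing the new (url, content) shape.
base = "https://example.com/docs/"  # assumed example URL, not from the commit

# get_page_content() now runs once per page inside the crawl, so nothing
# needs to be re-fetched when website_to_pdf() builds the PDF story.
all_pages = get_subdirectory_pages(base, base)

for page_url, content in all_pages:
    print(page_url, f"- {len(content)} text blocks")

One caveat carried over unchanged from the old code: visited=set() is a mutable default argument, so the visited set persists across calls; a second crawl in the same process will skip every URL the first one saw unless a fresh set is passed explicitly.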