bluenevus committed
Commit 9f222f2 · verified · 1 Parent(s): 12928b4

Update app.py

Files changed (1):
  1. app.py (+22, -18)
app.py CHANGED
@@ -2,11 +2,16 @@ import gradio as gr
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin, urlparse
-from reportlab.lib.pagesizes import letter
-from reportlab.lib.styles import getSampleStyleSheet
-from reportlab.platypus import SimpleDocTemplate, Paragraph
-from io import BytesIO
+from fpdf import FPDF
 import tempfile
+import re
+
+def clean_text(text):
+    # Remove any non-printable characters
+    text = ''.join(char for char in text if char.isprintable())
+    # Replace any remaining problematic characters
+    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
+    return text
 
 def get_page_content(url):
     try:
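
The new clean_text helper first drops non-printable characters, then replaces any remaining non-ASCII runs with a space, which likely keeps FPDF's built-in Latin-1 fonts from raising encode errors on scraped text. A standalone sketch of its effect (illustration only, restating the helper from this commit with a made-up input):

import re

def clean_text(text):
    # Drop non-printable characters (e.g. zero-width spaces, control chars)
    text = ''.join(char for char in text if char.isprintable())
    # Collapse any remaining non-ASCII runs into a single space
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return text

# The curly quotes and the accented character become plain spaces,
# and the zero-width space is dropped entirely.
print(clean_text("Smart \u201cquotes\u201d and caf\u00e9\u200b"))
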
@@ -15,15 +20,15 @@ def get_page_content(url):
         content = []
         for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
             for element in soup.find_all(tag):
-                text = element.get_text(strip=True)
+                text = clean_text(element.get_text(strip=True))
                 if text:
                     content.append(text)
         return content
     except Exception as e:
         return [f"Error processing {url}: {str(e)}"]
 
-def get_subdirectory_pages(url, base_url, visited=set()):
-    if url in visited:
+def get_subdirectory_pages(url, base_url, visited=set(), max_pages=100):
+    if url in visited or len(visited) >= max_pages:
         return []
 
     visited.add(url)
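
Note that the extraction loop in get_page_content groups text by tag type (all <p> first, then <h1> through <h6>, then <li>), not in document order. A self-contained sketch of the same loop on a made-up HTML snippet (illustration only, not part of the commit):

from bs4 import BeautifulSoup

html = "<h1>Title</h1><p>Intro text</p><ul><li>Item one</li></ul>"
soup = BeautifulSoup(html, "html.parser")

content = []
for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
    for element in soup.find_all(tag):
        text = element.get_text(strip=True)
        if text:
            content.append(text)

# Grouped by tag type: ['Intro text', 'Title', 'Item one']
print(content)
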
@@ -38,7 +43,9 @@ def get_subdirectory_pages(url, base_url, visited=set()):
             if href:
                 full_url = urljoin(base_url, href)
                 if full_url.startswith(base_url) and full_url not in visited:
-                    pages.extend(get_subdirectory_pages(full_url, base_url, visited))
+                    pages.extend(get_subdirectory_pages(full_url, base_url, visited, max_pages))
+                    if len(visited) >= max_pages:
+                        break
     except Exception as e:
         print(f"Error processing {url}: {e}")
 
@@ -49,21 +56,18 @@ def website_to_pdf(url):
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip('/')}/"
     all_pages = get_subdirectory_pages(base_url, base_url)
 
-    buffer = BytesIO()
-    doc = SimpleDocTemplate(buffer, pagesize=letter)
-    styles = getSampleStyleSheet()
-    story = []
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.set_font("Arial", size=12)
 
     for page_url, content in all_pages:
-        story.append(Paragraph(f"<b>{page_url}</b>", styles['Heading1']))
+        pdf.cell(200, 10, txt=page_url, ln=True)
         for text in content:
-            story.append(Paragraph(text, styles['BodyText']))
-        story.append(Paragraph("<br/><br/>", styles['BodyText']))
+            pdf.multi_cell(0, 10, txt=text)
+        pdf.add_page()
 
-    doc.build(story)
-
     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
-        tmp.write(buffer.getvalue())
+        pdf.output(tmp.name)
        output_file = tmp.name
 
     return output_file
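
For illustration, a hedged sketch of calling the capped crawler directly; the URL below is hypothetical. Passing a fresh set() sidesteps the shared mutable default visited=set(), which otherwise persists between calls inside a long-running process:

# Hypothetical direct use of the capped crawler (not part of app.py).
base = "https://docs.example.com/guide/"
pages = get_subdirectory_pages(base, base, visited=set(), max_pages=25)

# get_subdirectory_pages yields (page_url, content) pairs, as consumed by
# website_to_pdf above; at most 25 URLs are visited because of max_pages.
for page_url, content in pages:
    print(page_url, len(content), "text blocks")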
 
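The Gradio wiring itself is outside this diff. Assuming app.py exposes website_to_pdf roughly as below (the gr.Interface call, component choices, and labels are assumptions for illustration, not taken from the commit):

import gradio as gr

# Hypothetical interface: takes a URL, returns the temporary PDF path
# produced by website_to_pdf for download.
demo = gr.Interface(
    fn=website_to_pdf,
    inputs=gr.Textbox(label="Website URL"),
    outputs=gr.File(label="PDF"),
    title="Website to PDF",
)

if __name__ == "__main__":
    demo.launch()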