bluenevus committed on
Commit
2ad1f6f
·
verified ·
1 Parent(s): 04286a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -3
app.py CHANGED
@@ -37,13 +37,17 @@ def get_page_content(url):
37
 
38
  def get_all_doc_links(url):
39
  try:
 
40
  response = requests.get(url, timeout=10)
41
  response.raise_for_status()
42
  soup = BeautifulSoup(response.text, 'html.parser')
43
  main_content = soup.find('main')
44
  if main_content:
45
  links = main_content.find_all('a', href=True)
46
- return [urljoin(url, link['href']) for link in links if link['href'].startswith('/docs')]
 
 
 
47
  return []
48
  except Exception as e:
49
  logger.error(f"Error getting links from {url}: {str(e)}")
@@ -58,14 +62,18 @@ def website_to_pdf(url):
58
  pdf = FPDF()
59
  pdf.set_auto_page_break(auto=True, margin=15)
60
  pdf.add_page()
61
- pdf.set_font("Arial", size=12)
 
62
 
63
  for page_url in all_links:
64
  content = get_page_content(page_url)
65
  pdf.cell(0, 10, txt=page_url, ln=True)
66
  pdf.ln(5)
67
  for text in content:
68
- pdf.multi_cell(0, 10, txt=text)
 
 
 
69
  pdf.add_page()
70
 
71
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
 
37
 
38
  def get_all_doc_links(url):
39
  try:
40
+ logger.info(f"Fetching links from: {url}")
41
  response = requests.get(url, timeout=10)
42
  response.raise_for_status()
43
  soup = BeautifulSoup(response.text, 'html.parser')
44
  main_content = soup.find('main')
45
  if main_content:
46
  links = main_content.find_all('a', href=True)
47
+ doc_links = [urljoin(url, link['href']) for link in links if link['href'].startswith('/docs')]
48
+ logger.info(f"Found {len(doc_links)} documentation links")
49
+ return doc_links
50
+ logger.warning("No main content found on the page")
51
  return []
52
  except Exception as e:
53
  logger.error(f"Error getting links from {url}: {str(e)}")
 
62
  pdf = FPDF()
63
  pdf.set_auto_page_break(auto=True, margin=15)
64
  pdf.add_page()
65
+ pdf.add_font('DejaVu', '', 'DejaVuSansCondensed.ttf', uni=True)
66
+ pdf.set_font('DejaVu', size=12)
67
 
68
  for page_url in all_links:
69
  content = get_page_content(page_url)
70
  pdf.cell(0, 10, txt=page_url, ln=True)
71
  pdf.ln(5)
72
  for text in content:
73
+ try:
74
+ pdf.multi_cell(0, 10, txt=text)
75
+ except Exception as e:
76
+ logger.error(f"Error writing text to PDF: {str(e)}")
77
  pdf.add_page()
78
 
79
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp: