Update app.py
app.py CHANGED
@@ -37,13 +37,17 @@ def get_page_content(url):
 
 def get_all_doc_links(url):
     try:
+        logger.info(f"Fetching links from: {url}")
         response = requests.get(url, timeout=10)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
         main_content = soup.find('main')
         if main_content:
             links = main_content.find_all('a', href=True)
-
+            doc_links = [urljoin(url, link['href']) for link in links if link['href'].startswith('/docs')]
+            logger.info(f"Found {len(doc_links)} documentation links")
+            return doc_links
+        logger.warning("No main content found on the page")
         return []
     except Exception as e:
         logger.error(f"Error getting links from {url}: {str(e)}")
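For context, the filtering that this hunk adds boils down to resolving relative /docs hrefs against the page's base URL and discarding everything else. A minimal standalone sketch of that pattern; the base URL and HTML snippet below are made up purely for illustration:

from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Made-up base URL and markup, only to show the urljoin + startswith('/docs') filter.
base_url = "https://example.com/docs/index.html"
html = '<main><a href="/docs/intro">Intro</a><a href="https://other.site/x">Out</a></main>'

links = BeautifulSoup(html, 'html.parser').find('main').find_all('a', href=True)
doc_links = [urljoin(base_url, a['href']) for a in links if a['href'].startswith('/docs')]
print(doc_links)  # ['https://example.com/docs/intro']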
@@ -58,14 +62,18 @@ def website_to_pdf(url):
     pdf = FPDF()
     pdf.set_auto_page_break(auto=True, margin=15)
     pdf.add_page()
-    pdf.
+    pdf.add_font('DejaVu', '', 'DejaVuSansCondensed.ttf', uni=True)
+    pdf.set_font('DejaVu', size=12)
 
     for page_url in all_links:
         content = get_page_content(page_url)
         pdf.cell(0, 10, txt=page_url, ln=True)
         pdf.ln(5)
         for text in content:
-
+            try:
+                pdf.multi_cell(0, 10, txt=text)
+            except Exception as e:
+                logger.error(f"Error writing text to PDF: {str(e)}")
         pdf.add_page()
 
     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
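This hunk registers a Unicode TrueType font before writing page text, so characters outside Latin-1 in the scraped content don't break multi_cell, and wraps each write in a try/except so one bad string doesn't abort the whole PDF. A minimal sketch of that pattern, assuming (as the diff itself does) that DejaVuSansCondensed.ttf sits next to app.py; the sample strings and output filename are placeholders:

from fpdf import FPDF

pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()
# Register a Unicode TTF so multi_cell can render characters outside Latin-1.
# DejaVuSansCondensed.ttf must be present in the working directory.
pdf.add_font('DejaVu', '', 'DejaVuSansCondensed.ttf', uni=True)
pdf.set_font('DejaVu', size=12)

for text in ["Plain ASCII line", "Unicode test: αβγ ünïcødé"]:
    try:
        # Width 0 means "extend to the right margin"; long lines wrap automatically.
        pdf.multi_cell(0, 10, txt=text)
    except Exception as e:
        print(f"Error writing text to PDF: {e}")

pdf.output('example.pdf')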