Spaces:

MicroHealth
/

website-to-pdf

Sleeping

bluenevus committed on Apr 23

Commit

a8a1bcb

verified ·

1 Parent(s): 3edbcab

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -55,13 +55,13 @@ async def get_page_content(session, url):
                     text = await response.text()
                     soup = BeautifulSoup(text, 'html.parser')
                     content = []
-                    main_content = soup.find('article') or soup.find('main') or soup
                     if main_content:
-                        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
                             for element in main_content.find_all(tag):
                                 text = clean_text(element.get_text(strip=True))
                                 if text:
-                                    content.append(text)
                     logger.info(f"Found {len(content)} content items for {url}")
                     return content
                 else:
@@ -82,10 +82,7 @@ async def get_links(session, url, base_url):
                     valid_links = []
                     for link in links:
                         href = link['href']
-                        full_url = urljoin(url, href)
-                        # Check if the link is relative and doesn't start with '/'
-                        if not href.startswith('/') and not href.startswith('http'):
-                            full_url = f"{base_url}/{href}"
                         if full_url.startswith(base_url) and full_url != url:
                             valid_links.append(full_url)
                     return valid_links

                     text = await response.text()
                     soup = BeautifulSoup(text, 'html.parser')
                     content = []
+                    main_content = soup.find('main') or soup.find('div', class_='content') or soup
                     if main_content:
+                        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'pre', 'code']:
                             for element in main_content.find_all(tag):
                                 text = clean_text(element.get_text(strip=True))
                                 if text:
+                                    content.append(f"{tag.upper()}: {text}")
                     logger.info(f"Found {len(content)} content items for {url}")
                     return content
                 else:
                     valid_links = []
                     for link in links:
                         href = link['href']
+                        full_url = urljoin(base_url, href)
                         if full_url.startswith(base_url) and full_url != url:
                             valid_links.append(full_url)
                     return valid_links