bluenevus committed
Commit ead5062 · verified · 1 parent: 40c0a08

Update app.py

Files changed (1)
app.py +13 -23
app.py CHANGED
@@ -57,21 +57,14 @@ async def get_page_content(session, url):
                 content = []
 
                 # Look for the main content area
-                main_content = soup.find('div', class_='toc')
+                main_content = soup.find('div', id='react-entry-point')
 
                 if main_content:
-                    # Extract section titles and links
-                    for section in main_content.find_all('div', class_='toc--section'):
-                        title = section.find('h2', class_='toc-title-border')
-                        if title:
-                            content.append(f"H2: {title.text.strip()}")
-
-                        links_store = section.find('div', id=lambda x: x and x.startswith("{'type': 'links-store'"))
-                        if links_store:
-                            links = links_store.find_next('div', id=lambda x: x and x.startswith("{'type': 'links-show'"))
-                            if links:
-                                for link in links.find_all('a'):
-                                    content.append(f"LINK: {link.text.strip()} - {urljoin(url, link['href'])}")
+                    # Extract all text content
+                    for tag in main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'code']):
+                        text = clean_text(tag.get_text(strip=True))
+                        if text:
+                            content.append(f"{tag.name.upper()}: {text}")
 
                 logger.info(f"Found {len(content)} content items for {url}")
                 return content
@@ -81,7 +74,7 @@ async def get_page_content(session, url):
     except Exception as e:
         logger.error(f"Error processing {url}: {str(e)}")
         return [f"Error processing {url}: {str(e)}"]
-
+
 async def get_links(session, url, base_url):
     try:
         async with rate_limiter:
@@ -92,17 +85,14 @@ async def get_links(session, url, base_url):
                 valid_links = []
 
                 # Look for the main content area
-                main_content = soup.find('div', class_='toc')
+                main_content = soup.find('div', id='react-entry-point')
 
                 if main_content:
-                    # Find all link containers
-                    link_containers = main_content.find_all('div', id=lambda x: x and x.startswith("{'type': 'links-show'"))
-
-                    for container in link_containers:
-                        for link in container.find_all('a', href=True):
-                            full_url = urljoin(base_url, link['href'])
-                            if full_url.startswith(base_url) and full_url != url:
-                                valid_links.append(full_url)
+                    for link in main_content.find_all('a', href=True):
+                        href = link['href']
+                        full_url = urljoin(base_url, href)
+                        if full_url.startswith(base_url) and full_url != url:
+                            valid_links.append(full_url)
 
                 return valid_links
             else:
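
Note: the rewritten extraction path calls a clean_text helper that is not part of this diff; it presumably already exists elsewhere in app.py. A minimal sketch of what such a helper might look like, assuming it only normalizes whitespace (the actual definition may differ):

import re

def clean_text(text):
    # Hypothetical stand-in for the clean_text defined elsewhere in app.py:
    # collapse runs of whitespace into single spaces and trim both ends.
    return re.sub(r'\s+', ' ', text).strip()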
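
For context, a div with id 'react-entry-point' is the mount node Dash uses for its rendered front end, which fits the move away from the old 'toc' class selectors toward generic tag-based scraping. A self-contained check of the new selectors against a toy page (the HTML and base_url below are illustrative, not taken from the target site):

from urllib.parse import urljoin
from bs4 import BeautifulSoup

html = """
<div id="react-entry-point">
  <h1>Docs</h1>
  <p>Getting started guide.</p>
  <a href="/install">Install</a>
  <a href="https://elsewhere.example/off-site">Off-site</a>
</div>
"""
soup = BeautifulSoup(html, 'html.parser')
main_content = soup.find('div', id='react-entry-point')

# Mirrors the new get_page_content loop (clean_text omitted for brevity):
for tag in main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'code']):
    text = tag.get_text(strip=True)
    if text:
        print(f"{tag.name.upper()}: {text}")  # -> "H1: Docs", "P: Getting started guide."

# Mirrors the new get_links filter: resolve relative hrefs, keep same-site URLs only.
base_url = 'https://docs.example.com'
for link in main_content.find_all('a', href=True):
    full_url = urljoin(base_url, link['href'])
    if full_url.startswith(base_url):
        print(f"LINK: {full_url}")  # only the /install link survives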