bluenevus committed
Commit 40c0a08 · verified · 1 Parent(s): 156e042

Update app.py

Files changed (1)
  1. app.py +18 -9
app.py CHANGED
@@ -89,13 +89,21 @@ async def get_links(session, url, base_url):
         if response.status == 200:
             text = await response.text()
             soup = BeautifulSoup(text, 'html.parser')
-            links = soup.find_all('a', href=True)
             valid_links = []
-            for link in links:
-                href = link['href']
-                full_url = urljoin(base_url, href)
-                if full_url.startswith(base_url) and full_url != url:
-                    valid_links.append(full_url)
+
+            # Look for the main content area
+            main_content = soup.find('div', class_='toc')
+
+            if main_content:
+                # Find all link containers
+                link_containers = main_content.find_all('div', id=lambda x: x and x.startswith("{'type': 'links-show'"))
+
+                for container in link_containers:
+                    for link in container.find_all('a', href=True):
+                        full_url = urljoin(base_url, link['href'])
+                        if full_url.startswith(base_url) and full_url != url:
+                            valid_links.append(full_url)
+
             return valid_links
         else:
             logger.error(f"Error fetching links from {url}: HTTP {response.status}")
@@ -116,14 +124,15 @@ async def crawl_pages(base_url, max_depth):
                 continue
 
             visited.add(current_url)
-            logger.info(f"Crawling: {current_url} at depth {depth}")
+            start_time = time.time()
 
             content = await get_page_content(session, current_url)
             all_pages.append((current_url, content))
+            logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")
 
             if depth < max_depth:
-                new_links = [link.split(' - ')[1] for link in content if link.startswith('LINK:')]
-                for link in new_links:
+                links = await get_links(session, current_url, base_url)
+                for link in links:
                     if link not in visited and link not in [url for url, _ in to_visit]:
                         to_visit.append((link, depth + 1))
 
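
A quick way to sanity-check the new scoped link extraction outside the crawler is to run the same BeautifulSoup traversal over a small HTML fragment. The snippet below is a hypothetical check, not part of this commit: the markup, base_url, and url values are invented, while the 'toc' class and the "{'type': 'links-show'" id prefix mirror the updated get_links().

# Hypothetical standalone check of the new selector logic (not part of the commit).
from urllib.parse import urljoin
from bs4 import BeautifulSoup

base_url = "https://example.com/docs"    # assumed site root used by the crawler
url = "https://example.com/docs/index"   # assumed page currently being parsed

html = """
<div class="toc">
  <div id="{'type': 'links-show', 'index': 0}">
    <a href="/docs/page-1">Page 1</a>
    <a href="https://other.site/ignored">External</a>
  </div>
</div>
"""

soup = BeautifulSoup(html, 'html.parser')
valid_links = []

# Same traversal as the updated get_links(): scope the search to the 'toc'
# container, then to divs whose id starts with the "{'type': 'links-show'" marker.
main_content = soup.find('div', class_='toc')
if main_content:
    link_containers = main_content.find_all('div', id=lambda x: x and x.startswith("{'type': 'links-show'"))
    for container in link_containers:
        for link in container.find_all('a', href=True):
            full_url = urljoin(base_url, link['href'])
            if full_url.startswith(base_url) and full_url != url:
                valid_links.append(full_url)

print(valid_links)  # expected: ['https://example.com/docs/page-1']

Scoping the search to the 'toc' container (instead of the old soup.find_all('a', href=True)) presumably keeps the crawler from queueing navigation and footer links that live outside the table of contents.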