bluenevus committed
Commit 156e042 · verified · 1 Parent(s): a8a1bcb

Update app.py

Files changed (1)
  1. app.py +22 -25
app.py CHANGED
@@ -55,13 +55,24 @@ async def get_page_content(session, url):
                 text = await response.text()
                 soup = BeautifulSoup(text, 'html.parser')
                 content = []
-                main_content = soup.find('main') or soup.find('div', class_='content') or soup
+
+                # Look for the main content area
+                main_content = soup.find('div', class_='toc')
+
                 if main_content:
-                    for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'pre', 'code']:
-                        for element in main_content.find_all(tag):
-                            text = clean_text(element.get_text(strip=True))
-                            if text:
-                                content.append(f"{tag.upper()}: {text}")
+                    # Extract section titles and links
+                    for section in main_content.find_all('div', class_='toc--section'):
+                        title = section.find('h2', class_='toc-title-border')
+                        if title:
+                            content.append(f"H2: {title.text.strip()}")
+
+                        links_store = section.find('div', id=lambda x: x and x.startswith("{'type': 'links-store'"))
+                        if links_store:
+                            links = links_store.find_next('div', id=lambda x: x and x.startswith("{'type': 'links-show'"))
+                            if links:
+                                for link in links.find_all('a'):
+                                    content.append(f"LINK: {link.text.strip()} - {urljoin(url, link['href'])}")
+
                     logger.info(f"Found {len(content)} content items for {url}")
                     return content
                 else:
@@ -70,7 +81,7 @@ async def get_page_content(session, url):
     except Exception as e:
         logger.error(f"Error processing {url}: {str(e)}")
         return [f"Error processing {url}: {str(e)}"]
-
+
 async def get_links(session, url, base_url):
     try:
         async with rate_limiter:
@@ -105,28 +116,14 @@ async def crawl_pages(base_url, max_depth):
                 continue
 
             visited.add(current_url)
-            start_time = time.time()
-
-            with get_db_connection() as conn:
-                c = conn.cursor()
-                c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
-                result = c.fetchone()
-
-            if result:
-                content = eval(result[0])  # Convert string back to list
-            else:
-                content = await get_page_content(session, current_url)
-                with get_db_connection() as conn:
-                    c = conn.cursor()
-                    c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
-                    conn.commit()
+            logger.info(f"Crawling: {current_url} at depth {depth}")
 
+            content = await get_page_content(session, current_url)
             all_pages.append((current_url, content))
-            logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")
 
             if depth < max_depth:
-                links = await get_links(session, current_url, base_url)
-                for link in links:
+                new_links = [link.split(' - ')[1] for link in content if link.startswith('LINK:')]
+                for link in new_links:
                     if link not in visited and link not in [url for url, _ in to_visit]:
                         to_visit.append((link, depth + 1))
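
For reference, here is a minimal synchronous sketch of the TOC-parsing logic this commit introduces, runnable on its own with BeautifulSoup4 installed. The sample URL and HTML snippet are invented for illustration; only the selectors (div.toc, div.toc--section, h2.toc-title-border, and the links-store / links-show id prefixes) mirror the new get_page_content() code.

# Minimal, synchronous sketch of the new TOC parsing (no aiohttp/rate limiter).
# SAMPLE_URL and SAMPLE_HTML are made-up examples, not from the repo.
from urllib.parse import urljoin
from bs4 import BeautifulSoup

SAMPLE_URL = "https://example.com/docs/"  # hypothetical page
SAMPLE_HTML = """
<div class="toc">
  <div class="toc--section">
    <h2 class="toc-title-border">Getting Started</h2>
    <div id="{'type': 'links-store', 'index': 0}"></div>
    <div id="{'type': 'links-show', 'index': 0}">
      <a href="/docs/install">Install</a>
      <a href="/docs/quickstart">Quickstart</a>
    </div>
  </div>
</div>
"""

soup = BeautifulSoup(SAMPLE_HTML, "html.parser")
content = []
main_content = soup.find("div", class_="toc")
if main_content:
    for section in main_content.find_all("div", class_="toc--section"):
        title = section.find("h2", class_="toc-title-border")
        if title:
            content.append(f"H2: {title.text.strip()}")
        links_store = section.find("div", id=lambda x: x and x.startswith("{'type': 'links-store'"))
        if links_store:
            links = links_store.find_next("div", id=lambda x: x and x.startswith("{'type': 'links-show'"))
            if links:
                for link in links.find_all("a"):
                    content.append(f"LINK: {link.text.strip()} - {urljoin(SAMPLE_URL, link['href'])}")

print(content)
# ['H2: Getting Started',
#  'LINK: Install - https://example.com/docs/install',
#  'LINK: Quickstart - https://example.com/docs/quickstart']

Note that crawl_pages() now re-queues URLs by splitting each "LINK:" entry on " - ", so the format string in get_page_content() and the split in crawl_pages() have to stay in sync.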