Update app.py
app.py CHANGED
@@ -55,17 +55,13 @@ async def get_page_content(session, url):
                text = await response.text()
                soup = BeautifulSoup(text, 'html.parser')
                content = []
-
-               # Look for the main content area
-               main_content = soup.find('div', id='react-entry-point')
-
+               main_content = soup.find('article') or soup.find('main') or soup
                if main_content:
-
-
-
-
-
-
+                   for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
+                       for element in main_content.find_all(tag):
+                           text = clean_text(element.get_text(strip=True))
+                           if text:
+                               content.append(text)
                logger.info(f"Found {len(content)} content items for {url}")
                return content
            else:
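
The new extraction path calls clean_text(), which is defined elsewhere in app.py and not shown in this diff. A minimal sketch of what such a helper could look like, assuming it only normalizes whitespace (the real implementation may do more):

import re

def clean_text(text):
    # Hypothetical stand-in for the clean_text helper used above:
    # collapse runs of whitespace into single spaces and trim the ends.
    return re.sub(r"\s+", " ", text).strip()
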
@@ -82,18 +78,12 @@ async def get_links(session, url, base_url):
            if response.status == 200:
                text = await response.text()
                soup = BeautifulSoup(text, 'html.parser')
+               links = soup.find_all('a', href=True)
                valid_links = []
-
-
-
-
-               if main_content:
-                   for link in main_content.find_all('a', href=True):
-                       href = link['href']
-                       full_url = urljoin(base_url, href)
-                       if full_url.startswith(base_url) and full_url != url:
-                           valid_links.append(full_url)
-
+               for link in links:
+                   full_url = urljoin(url, link['href'])
+                   if full_url.startswith(base_url) and full_url != url:
+                       valid_links.append(full_url)
                return valid_links
            else:
                logger.error(f"Error fetching links from {url}: HTTP {response.status}")
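
The rewritten get_links resolves each href against the current page URL (urljoin(url, ...)) rather than the site root, then keeps only links that stay under base_url and differ from the page itself. A standalone illustration of that filter, using made-up URLs:

from urllib.parse import urljoin

base_url = "https://example.com"          # assumed values for illustration
url = "https://example.com/docs/intro"

hrefs = ["guide", "/api", "https://other.site/x", "https://example.com/docs/intro"]
valid_links = []
for href in hrefs:
    full_url = urljoin(url, href)         # resolve relative to the current page
    if full_url.startswith(base_url) and full_url != url:
        valid_links.append(full_url)

print(valid_links)
# ['https://example.com/docs/guide', 'https://example.com/api']
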
@@ -116,14 +106,27 @@ async def crawl_pages(base_url, max_depth):
            visited.add(current_url)
            start_time = time.time()

-
+           with get_db_connection() as conn:
+               c = conn.cursor()
+               c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
+               result = c.fetchone()
+
+           if result:
+               content = eval(result[0])  # Convert string back to list
+           else:
+               content = await get_page_content(session, current_url)
+               with get_db_connection() as conn:
+                   c = conn.cursor()
+                   c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
+                   conn.commit()
+
            all_pages.append((current_url, content))
            logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")

            if depth < max_depth:
                links = await get_links(session, current_url, base_url)
                for link in links:
-                   if link not in visited
+                   if link not in visited:
                        to_visit.append((link, depth + 1))

        return all_pages
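
The caching block relies on a get_db_connection helper and a pages table, neither of which appears in this diff. A minimal sketch of how they might be wired up, assuming plain sqlite3 and the three-column schema implied by the INSERT statement (url, content, depth):

import sqlite3
from contextlib import contextmanager

DB_PATH = "pages.db"  # assumed filename; not specified in the diff

@contextmanager
def get_db_connection():
    # Hypothetical helper matching how the diff uses it:
    # yields a connection and closes it when the with-block exits.
    conn = sqlite3.connect(DB_PATH)
    try:
        yield conn
    finally:
        conn.close()

def init_db():
    # Table shape inferred from "INSERT INTO pages VALUES (?, ?, ?)".
    with get_db_connection() as conn:
        conn.execute(
            "CREATE TABLE IF NOT EXISTS pages "
            "(url TEXT PRIMARY KEY, content TEXT, depth INTEGER)"
        )
        conn.commit()

Storing the content list with str() and reading it back with eval() works for this round-trip, though json.dumps/json.loads would avoid evaluating strings pulled from the database.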