Spaces:

MicroHealth
/

website-to-pdf

Sleeping

bluenevus committed on Apr 21

Commit

3696013

verified ·

1 Parent(s): fda7b65

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -12,6 +12,7 @@ from aiolimiter import AsyncLimiter
 import sqlite3
 from contextlib import contextmanager
 from threading import local
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -36,6 +37,7 @@ def init_db():
         c = conn.cursor()
         c.execute('''CREATE TABLE IF NOT EXISTS pages
                      (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')
         conn.commit()
 init_db()
@@ -102,6 +104,7 @@ async def crawl_pages(base_url, max_depth):
                 continue
             visited.add(current_url)
             with get_db_connection() as conn:
                 c = conn.cursor()
@@ -118,7 +121,7 @@ async def crawl_pages(base_url, max_depth):
                     conn.commit()
             all_pages.append((current_url, content))
-            logger.info(f"Processed page: {current_url} at depth {depth}")
             if depth < max_depth:
                 links = await get_links(session, current_url, base_url)
@@ -144,7 +147,8 @@ def website_to_pdf(all_pages):
                 pdf.multi_cell(0, 10, txt=text[:200])  # Limit text length to avoid issues
             except Exception as e:
                 logger.error(f"Error writing text to PDF: {str(e)}")
-        pdf.add_page()
     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
         pdf_path = tmp.name

 import sqlite3
 from contextlib import contextmanager
 from threading import local
+import concurrent.futures
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
         c = conn.cursor()
         c.execute('''CREATE TABLE IF NOT EXISTS pages
                      (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')
+        c.execute('''CREATE INDEX IF NOT EXISTS idx_url ON pages(url)''')
         conn.commit()
 init_db()
                 continue
             visited.add(current_url)
+            start_time = time.time()
             with get_db_connection() as conn:
                 c = conn.cursor()
                     conn.commit()
             all_pages.append((current_url, content))
+            logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")
             if depth < max_depth:
                 links = await get_links(session, current_url, base_url)
                 pdf.multi_cell(0, 10, txt=text[:200])  # Limit text length to avoid issues
             except Exception as e:
                 logger.error(f"Error writing text to PDF: {str(e)}")
+        if pdf.get_y() > 250:  # Add a new page if the current page is almost full
+            pdf.add_page()
     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
         pdf_path = tmp.name