Update app.py
app.py CHANGED
@@ -6,25 +6,40 @@ from fpdf import FPDF
 import tempfile
 import re
 import logging
-from concurrent.futures import ThreadPoolExecutor, as_completed
 import asyncio
 import aiohttp
 from aiolimiter import AsyncLimiter
 import sqlite3
+from contextlib import contextmanager
+from threading import local
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-#
-conn = sqlite3.connect('crawl_cache.db')
-c = conn.cursor()
-c.execute('''CREATE TABLE IF NOT EXISTS pages
-             (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')
-conn.commit()
+# Thread-local storage for database connections
+thread_local = local()
 
 # Rate limiter: 10 requests per second
 rate_limiter = AsyncLimiter(10, 1)
 
+@contextmanager
+def get_db_connection():
+    if not hasattr(thread_local, "connection"):
+        thread_local.connection = sqlite3.connect('crawl_cache.db')
+    try:
+        yield thread_local.connection
+    finally:
+        pass  # We'll keep the connection open for reuse
+
+def init_db():
+    with get_db_connection() as conn:
+        c = conn.cursor()
+        c.execute('''CREATE TABLE IF NOT EXISTS pages
+                     (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')
+        conn.commit()
+
+init_db()
+
 def clean_text(text):
     text = ''.join(char for char in text if char.isprintable())
     text = re.sub(r'[^\x00-\x7F]+', ' ', text)
@@ -88,16 +103,19 @@ async def crawl_pages(base_url, max_depth):
 
         visited.add(current_url)
 
-        c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
-        result = c.fetchone()
-
+        with get_db_connection() as conn:
+            c = conn.cursor()
+            c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
+            result = c.fetchone()
+
         if result:
             content = eval(result[0])  # Convert string back to list
         else:
             content = await get_page_content(session, current_url)
-            c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
-            conn.commit()
-
+            with get_db_connection() as conn:
+                c = conn.cursor()
+                c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
+                conn.commit()
 
         all_pages.append((current_url, content))
         logger.info(f"Processed page: {current_url} at depth {depth}")
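The get_db_connection helper introduced in this commit caches one sqlite3 connection per thread, which matters because a sqlite3 connection is, by default, usable only from the thread that created it (check_same_thread=True). Below is a minimal standalone sketch of the same pattern outside the app; the cache_page helper, the db_path parameter, and the example URLs are illustrative placeholders, not code from app.py.

# Illustrative sketch (not part of the commit): one cached sqlite3
# connection per thread, exercised from a few worker threads.
import sqlite3
from contextlib import contextmanager
from threading import Thread, local

thread_local = local()

@contextmanager
def get_db_connection(db_path='crawl_cache.db'):
    # Lazily create one connection per thread and keep reusing it;
    # by default a sqlite3 connection cannot be shared across threads.
    if not hasattr(thread_local, "connection"):
        thread_local.connection = sqlite3.connect(db_path)
    try:
        yield thread_local.connection
    finally:
        pass  # keep the connection open for reuse within this thread

def cache_page(url, content, depth):
    # Hypothetical helper mirroring the caching done in crawl_pages.
    with get_db_connection() as conn:
        conn.execute("CREATE TABLE IF NOT EXISTS pages "
                     "(url TEXT PRIMARY KEY, content TEXT, depth INTEGER)")
        conn.execute("INSERT OR REPLACE INTO pages VALUES (?, ?, ?)",
                     (url, content, depth))
        conn.commit()

threads = [Thread(target=cache_page, args=(f"https://example.com/{i}", "body", 0))
           for i in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()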