Update app.py
app.py CHANGED
@@ -6,25 +6,40 @@ from fpdf import FPDF
 import tempfile
 import re
 import logging
-from concurrent.futures import ThreadPoolExecutor, as_completed
 import asyncio
 import aiohttp
 from aiolimiter import AsyncLimiter
 import sqlite3
+from contextlib import contextmanager
+from threading import local
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-#
-conn = sqlite3.connect('crawl_cache.db')
-c = conn.cursor()
-c.execute('''CREATE TABLE IF NOT EXISTS pages
-             (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')
-conn.commit()
+# Thread-local storage for database connections
+thread_local = local()
 
 # Rate limiter: 10 requests per second
 rate_limiter = AsyncLimiter(10, 1)
 
+@contextmanager
+def get_db_connection():
+    if not hasattr(thread_local, "connection"):
+        thread_local.connection = sqlite3.connect('crawl_cache.db')
+    try:
+        yield thread_local.connection
+    finally:
+        pass  # We'll keep the connection open for reuse
+
+def init_db():
+    with get_db_connection() as conn:
+        c = conn.cursor()
+        c.execute('''CREATE TABLE IF NOT EXISTS pages
+                     (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')
+        conn.commit()
+
+init_db()
+
 def clean_text(text):
     text = ''.join(char for char in text if char.isprintable())
     text = re.sub(r'[^\x00-\x7F]+', ' ', text)
@@ -88,16 +103,19 @@ async def crawl_pages(base_url, max_depth):
 
         visited.add(current_url)
 
-        c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
-        result = c.fetchone()
-
+        with get_db_connection() as conn:
+            c = conn.cursor()
+            c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
+            result = c.fetchone()
+
         if result:
             content = eval(result[0])  # Convert string back to list
         else:
             content = await get_page_content(session, current_url)
-            c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
-            conn.commit()
-
+            with get_db_connection() as conn:
+                c = conn.cursor()
+                c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
+                conn.commit()
 
         all_pages.append((current_url, content))
         logger.info(f"Processed page: {current_url} at depth {depth}")
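The get_db_connection helper introduced in this commit caches one sqlite3 connection per thread, which matters because a sqlite3 connection is, by default, usable only from the thread that created it (check_same_thread=True). Below is a minimal standalone sketch of the same pattern outside the app; the cache_page helper, the db_path parameter, and the example URLs are illustrative placeholders, not code from app.py.

# Illustrative sketch (not part of the commit): one cached sqlite3
# connection per thread, exercised from a few worker threads.
import sqlite3
from contextlib import contextmanager
from threading import Thread, local

thread_local = local()

@contextmanager
def get_db_connection(db_path='crawl_cache.db'):
    # Lazily create one connection per thread and keep reusing it;
    # by default a sqlite3 connection cannot be shared across threads.
    if not hasattr(thread_local, "connection"):
        thread_local.connection = sqlite3.connect(db_path)
    try:
        yield thread_local.connection
    finally:
        pass  # keep the connection open for reuse within this thread

def cache_page(url, content, depth):
    # Hypothetical helper mirroring the caching done in crawl_pages.
    with get_db_connection() as conn:
        conn.execute("CREATE TABLE IF NOT EXISTS pages "
                     "(url TEXT PRIMARY KEY, content TEXT, depth INTEGER)")
        conn.execute("INSERT OR REPLACE INTO pages VALUES (?, ?, ?)",
                     (url, content, depth))
        conn.commit()

threads = [Thread(target=cache_page, args=(f"https://example.com/{i}", "body", 0))
           for i in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()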