bluenevus committed
Commit fda7b65 · verified · 1 Parent(s): f63dcb0

Update app.py

Files changed (1):
  1. app.py +31 -13
app.py CHANGED
@@ -6,25 +6,40 @@ from fpdf import FPDF
  import tempfile
  import re
  import logging
- from concurrent.futures import ThreadPoolExecutor, as_completed
  import asyncio
  import aiohttp
  from aiolimiter import AsyncLimiter
  import sqlite3
+ from contextlib import contextmanager
+ from threading import local
 
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)
 
- # Initialize SQLite database
- conn = sqlite3.connect('crawl_cache.db')
- c = conn.cursor()
- c.execute('''CREATE TABLE IF NOT EXISTS pages
-              (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')
- conn.commit()
+ # Thread-local storage for database connections
+ thread_local = local()
 
  # Rate limiter: 10 requests per second
  rate_limiter = AsyncLimiter(10, 1)
 
+ @contextmanager
+ def get_db_connection():
+     if not hasattr(thread_local, "connection"):
+         thread_local.connection = sqlite3.connect('crawl_cache.db')
+     try:
+         yield thread_local.connection
+     finally:
+         pass  # We'll keep the connection open for reuse
+
+ def init_db():
+     with get_db_connection() as conn:
+         c = conn.cursor()
+         c.execute('''CREATE TABLE IF NOT EXISTS pages
+                      (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')
+         conn.commit()
+
+ init_db()
+
  def clean_text(text):
      text = ''.join(char for char in text if char.isprintable())
      text = re.sub(r'[^\x00-\x7F]+', ' ', text)
@@ -88,16 +103,19 @@ async def crawl_pages(base_url, max_depth):
 
          visited.add(current_url)
 
-         # Check if page is already in the database
-         c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
-         result = c.fetchone()
+         with get_db_connection() as conn:
+             c = conn.cursor()
+             c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
+             result = c.fetchone()
+
          if result:
              content = eval(result[0])  # Convert string back to list
         else:
             content = await get_page_content(session, current_url)
-             # Store in database
-             c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
-             conn.commit()
+             with get_db_connection() as conn:
+                 c = conn.cursor()
+                 c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
+                 conn.commit()
 
          all_pages.append((current_url, content))
          logger.info(f"Processed page: {current_url} at depth {depth}")