bluenevus committed
Commit 3696013 · verified · 1 Parent(s): fda7b65

Update app.py

Files changed (1): app.py (+6, -2)
app.py CHANGED
@@ -12,6 +12,7 @@ from aiolimiter import AsyncLimiter
 import sqlite3
 from contextlib import contextmanager
 from threading import local
+import concurrent.futures
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -36,6 +37,7 @@ def init_db():
         c = conn.cursor()
         c.execute('''CREATE TABLE IF NOT EXISTS pages
                      (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')
+        c.execute('''CREATE INDEX IF NOT EXISTS idx_url ON pages(url)''')
         conn.commit()
 
 init_db()
@@ -102,6 +104,7 @@ async def crawl_pages(base_url, max_depth):
                 continue
 
             visited.add(current_url)
+            start_time = time.time()
 
             with get_db_connection() as conn:
                 c = conn.cursor()
@@ -118,7 +121,7 @@ async def crawl_pages(base_url, max_depth):
                 conn.commit()
 
             all_pages.append((current_url, content))
-            logger.info(f"Processed page: {current_url} at depth {depth}")
+            logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")
 
             if depth < max_depth:
                 links = await get_links(session, current_url, base_url)
@@ -144,7 +147,8 @@ def website_to_pdf(all_pages):
             pdf.multi_cell(0, 10, txt=text[:200])  # Limit text length to avoid issues
         except Exception as e:
             logger.error(f"Error writing text to PDF: {str(e)}")
-        pdf.add_page()
+        if pdf.get_y() > 250:  # Add a new page if the current page is almost full
+            pdf.add_page()
 
     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
         pdf_path = tmp.name
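
For context on the schema change, here is a minimal standalone sketch of the init_db() pattern used above. The DB_PATH name and the direct sqlite3.connect() call are assumptions for illustration; app.py's actual get_db_connection() helper is not shown in this diff.

import sqlite3

DB_PATH = "pages.db"  # assumed filename; app.py's real connection setup is not in this diff

def init_db():
    # Idempotent setup: both statements are no-ops when the objects already exist,
    # so calling init_db() at import time is safe.
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS pages
                 (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')
    # Explicit index on url, as in the commit. SQLite already backs a TEXT
    # PRIMARY KEY with an implicit unique index, so this is mostly defensive.
    c.execute('''CREATE INDEX IF NOT EXISTS idx_url ON pages(url)''')
    conn.commit()
    conn.close()

init_db()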
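
The timing change records time.time() before the per-page work and reports the elapsed seconds in the existing log line. A small illustration of the pattern; process_page here is a hypothetical stand-in for the crawl/DB work in crawl_pages, not a function from app.py.

import logging
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def process_page(url, depth):
    start_time = time.time()   # capture a wall-clock start, as in the commit
    time.sleep(0.1)            # placeholder for fetching, parsing, and storing the page
    elapsed = time.time() - start_time
    logger.info(f"Processed page: {url} at depth {depth} in {elapsed:.2f} seconds")

process_page("https://example.com", 0)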
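
The PDF change replaces an unconditional pdf.add_page() with a check on the current vertical position. A sketch of that pattern with the fpdf package; the sample loop and the "preview.pdf" output name are illustrative, since the real content comes from the crawled (url, content) pairs.

from fpdf import FPDF

pdf = FPDF()
pdf.add_page()
pdf.set_font("Arial", size=12)

for text in ["sample page text"] * 50:      # stand-in for the crawled page contents
    pdf.multi_cell(0, 10, txt=text[:200])   # limit text length to avoid issues
    if pdf.get_y() > 250:                   # near the bottom of an A4 page (297 mm tall)
        pdf.add_page()                      # break only when the page is almost full

pdf.output("preview.pdf")

fpdf's automatic page break (set_auto_page_break, on by default) already prevents overflow, so the explicit get_y() check mainly controls where the breaks land rather than adding a page on every iteration.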