Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -12,6 +12,7 @@ from aiolimiter import AsyncLimiter
|
|
12 |
import sqlite3
|
13 |
from contextlib import contextmanager
|
14 |
from threading import local
|
|
|
15 |
|
16 |
logging.basicConfig(level=logging.INFO)
|
17 |
logger = logging.getLogger(__name__)
|
@@ -36,6 +37,7 @@ def init_db():
|
|
36 |
c = conn.cursor()
|
37 |
c.execute('''CREATE TABLE IF NOT EXISTS pages
|
38 |
(url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')
|
|
|
39 |
conn.commit()
|
40 |
|
41 |
init_db()
|
@@ -102,6 +104,7 @@ async def crawl_pages(base_url, max_depth):
|
|
102 |
continue
|
103 |
|
104 |
visited.add(current_url)
|
|
|
105 |
|
106 |
with get_db_connection() as conn:
|
107 |
c = conn.cursor()
|
@@ -118,7 +121,7 @@ async def crawl_pages(base_url, max_depth):
|
|
118 |
conn.commit()
|
119 |
|
120 |
all_pages.append((current_url, content))
|
121 |
-
logger.info(f"Processed page: {current_url} at depth {depth}")
|
122 |
|
123 |
if depth < max_depth:
|
124 |
links = await get_links(session, current_url, base_url)
|
@@ -144,7 +147,8 @@ def website_to_pdf(all_pages):
|
|
144 |
pdf.multi_cell(0, 10, txt=text[:200]) # Limit text length to avoid issues
|
145 |
except Exception as e:
|
146 |
logger.error(f"Error writing text to PDF: {str(e)}")
|
147 |
-
pdf.
|
|
|
148 |
|
149 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
|
150 |
pdf_path = tmp.name
|
|
|
12 |
import sqlite3
|
13 |
from contextlib import contextmanager
|
14 |
from threading import local
|
15 |
+
import concurrent.futures
|
16 |
|
17 |
logging.basicConfig(level=logging.INFO)
|
18 |
logger = logging.getLogger(__name__)
|
|
|
37 |
c = conn.cursor()
|
38 |
c.execute('''CREATE TABLE IF NOT EXISTS pages
|
39 |
(url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')
|
40 |
+
c.execute('''CREATE INDEX IF NOT EXISTS idx_url ON pages(url)''')
|
41 |
conn.commit()
|
42 |
|
43 |
init_db()
|
|
|
104 |
continue
|
105 |
|
106 |
visited.add(current_url)
|
107 |
+
start_time = time.time()
|
108 |
|
109 |
with get_db_connection() as conn:
|
110 |
c = conn.cursor()
|
|
|
121 |
conn.commit()
|
122 |
|
123 |
all_pages.append((current_url, content))
|
124 |
+
logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")
|
125 |
|
126 |
if depth < max_depth:
|
127 |
links = await get_links(session, current_url, base_url)
|
|
|
147 |
pdf.multi_cell(0, 10, txt=text[:200]) # Limit text length to avoid issues
|
148 |
except Exception as e:
|
149 |
logger.error(f"Error writing text to PDF: {str(e)}")
|
150 |
+
if pdf.get_y() > 250: # Add a new page if the current page is almost full
|
151 |
+
pdf.add_page()
|
152 |
|
153 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
|
154 |
pdf_path = tmp.name
|