Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -122,28 +122,32 @@ async def crawl_pages(base_url, max_depth):
|
|
122 |
visited.add(current_url)
|
123 |
start_time = time.time()
|
124 |
|
125 |
-
|
126 |
-
c = conn.cursor()
|
127 |
-
c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
|
128 |
-
result = c.fetchone()
|
129 |
-
|
130 |
-
if result:
|
131 |
-
content = eval(result[0]) # Convert string back to list
|
132 |
-
else:
|
133 |
-
content = await get_page_content(session, current_url)
|
134 |
with get_db_connection() as conn:
|
135 |
c = conn.cursor()
|
136 |
-
c.execute("
|
137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
|
139 |
-
|
140 |
-
|
141 |
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
|
|
|
|
|
|
147 |
|
148 |
return all_pages
|
149 |
|
@@ -176,6 +180,8 @@ def website_to_pdf(all_pages):
|
|
176 |
async def process_url(url, depth):
|
177 |
try:
|
178 |
all_pages = await crawl_pages(url, depth)
|
|
|
|
|
179 |
pdf_file = website_to_pdf(all_pages)
|
180 |
return pdf_file
|
181 |
except Exception as e:
|
@@ -228,22 +234,28 @@ def update_output(n_clicks, url, depth):
|
|
228 |
|
229 |
pdf_path = asyncio.run(process_url(url, depth))
|
230 |
|
231 |
-
if pdf_path
|
232 |
-
return pdf_path
|
233 |
-
|
234 |
-
with open(pdf_path, "rb") as f:
|
235 |
-
encoded = base64.b64encode(f.read()).decode()
|
236 |
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
247 |
|
248 |
if __name__ == '__main__':
|
249 |
-
|
|
|
|
|
|
122 |
visited.add(current_url)
|
123 |
start_time = time.time()
|
124 |
|
125 |
+
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
with get_db_connection() as conn:
|
127 |
c = conn.cursor()
|
128 |
+
c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
|
129 |
+
result = c.fetchone()
|
130 |
+
|
131 |
+
if result:
|
132 |
+
content = eval(result[0]) # Convert string back to list
|
133 |
+
else:
|
134 |
+
content = await get_page_content(session, current_url)
|
135 |
+
with get_db_connection() as conn:
|
136 |
+
c = conn.cursor()
|
137 |
+
c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
|
138 |
+
conn.commit()
|
139 |
|
140 |
+
all_pages.append((current_url, content))
|
141 |
+
logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")
|
142 |
|
143 |
+
if depth < max_depth:
|
144 |
+
links = await get_links(session, current_url, base_url)
|
145 |
+
for link in links:
|
146 |
+
if link not in visited:
|
147 |
+
to_visit.append((link, depth + 1))
|
148 |
+
except Exception as e:
|
149 |
+
logger.error(f"Error processing {current_url}: {str(e)}")
|
150 |
+
# Continue with the next URL even if this one fails
|
151 |
|
152 |
return all_pages
|
153 |
|
|
|
180 |
async def process_url(url, depth):
|
181 |
try:
|
182 |
all_pages = await crawl_pages(url, depth)
|
183 |
+
if not all_pages:
|
184 |
+
return "No pages were successfully crawled. Please check the URL and try again."
|
185 |
pdf_file = website_to_pdf(all_pages)
|
186 |
return pdf_file
|
187 |
except Exception as e:
|
|
|
234 |
|
235 |
pdf_path = asyncio.run(process_url(url, depth))
|
236 |
|
237 |
+
if isinstance(pdf_path, str):
|
238 |
+
return pdf_path # This is an error message
|
|
|
|
|
|
|
239 |
|
240 |
+
try:
|
241 |
+
with open(pdf_path, "rb") as f:
|
242 |
+
encoded = base64.b64encode(f.read()).decode()
|
243 |
+
|
244 |
+
os.unlink(pdf_path) # Remove the temporary file
|
245 |
+
|
246 |
+
return html.Div([
|
247 |
+
html.H4("PDF Generated Successfully"),
|
248 |
+
html.A(
|
249 |
+
dbc.Button("Download PDF", color="success", className="mt-2"),
|
250 |
+
href=f"data:application/pdf;base64,{encoded}",
|
251 |
+
download="website_content.pdf"
|
252 |
+
)
|
253 |
+
])
|
254 |
+
except Exception as e:
|
255 |
+
logger.error(f"Error creating download link: {str(e)}")
|
256 |
+
return f"An error occurred while creating the download link: {str(e)}"
|
257 |
|
258 |
if __name__ == '__main__':
|
259 |
+
print("Starting the Dash application...")
|
260 |
+
app.run(debug=True, host='0.0.0.0', port=7860)
|
261 |
+
print("Dash application has finished running.")
|