bluenevus committed on
Commit
0b1d7d6
·
verified ·
1 Parent(s): 1d098a5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -34
app.py CHANGED
@@ -122,28 +122,32 @@ async def crawl_pages(base_url, max_depth):
122
  visited.add(current_url)
123
  start_time = time.time()
124
 
125
- with get_db_connection() as conn:
126
- c = conn.cursor()
127
- c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
128
- result = c.fetchone()
129
-
130
- if result:
131
- content = eval(result[0]) # Convert string back to list
132
- else:
133
- content = await get_page_content(session, current_url)
134
  with get_db_connection() as conn:
135
  c = conn.cursor()
136
- c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
137
- conn.commit()
 
 
 
 
 
 
 
 
 
138
 
139
- all_pages.append((current_url, content))
140
- logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")
141
 
142
- if depth < max_depth:
143
- links = await get_links(session, current_url, base_url)
144
- for link in links:
145
- if link not in visited:
146
- to_visit.append((link, depth + 1))
 
 
 
147
 
148
  return all_pages
149
 
@@ -176,6 +180,8 @@ def website_to_pdf(all_pages):
176
  async def process_url(url, depth):
177
  try:
178
  all_pages = await crawl_pages(url, depth)
 
 
179
  pdf_file = website_to_pdf(all_pages)
180
  return pdf_file
181
  except Exception as e:
@@ -228,22 +234,28 @@ def update_output(n_clicks, url, depth):
228
 
229
  pdf_path = asyncio.run(process_url(url, depth))
230
 
231
- if pdf_path.startswith("An error occurred"):
232
- return pdf_path
233
-
234
- with open(pdf_path, "rb") as f:
235
- encoded = base64.b64encode(f.read()).decode()
236
 
237
- os.unlink(pdf_path) # Remove the temporary file
238
-
239
- return html.Div([
240
- html.H4("PDF Generated Successfully"),
241
- html.A(
242
- dbc.Button("Download PDF", color="success", className="mt-2"),
243
- href=f"data:application/pdf;base64,{encoded}",
244
- download="website_content.pdf"
245
- )
246
- ])
 
 
 
 
 
 
 
247
 
248
  if __name__ == '__main__':
249
- app.run(debug=True, host='0.0.0.0', port=7860)
 
 
 
122
  visited.add(current_url)
123
  start_time = time.time()
124
 
125
+ try:
 
 
 
 
 
 
 
 
126
  with get_db_connection() as conn:
127
  c = conn.cursor()
128
+ c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
129
+ result = c.fetchone()
130
+
131
+ if result:
132
+ content = eval(result[0]) # Convert string back to list
133
+ else:
134
+ content = await get_page_content(session, current_url)
135
+ with get_db_connection() as conn:
136
+ c = conn.cursor()
137
+ c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
138
+ conn.commit()
139
 
140
+ all_pages.append((current_url, content))
141
+ logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")
142
 
143
+ if depth < max_depth:
144
+ links = await get_links(session, current_url, base_url)
145
+ for link in links:
146
+ if link not in visited:
147
+ to_visit.append((link, depth + 1))
148
+ except Exception as e:
149
+ logger.error(f"Error processing {current_url}: {str(e)}")
150
+ # Continue with the next URL even if this one fails
151
 
152
  return all_pages
153
 
 
180
  async def process_url(url, depth):
181
  try:
182
  all_pages = await crawl_pages(url, depth)
183
+ if not all_pages:
184
+ return "No pages were successfully crawled. Please check the URL and try again."
185
  pdf_file = website_to_pdf(all_pages)
186
  return pdf_file
187
  except Exception as e:
 
234
 
235
  pdf_path = asyncio.run(process_url(url, depth))
236
 
237
+ if isinstance(pdf_path, str):
238
+ return pdf_path # This is an error message
 
 
 
239
 
240
+ try:
241
+ with open(pdf_path, "rb") as f:
242
+ encoded = base64.b64encode(f.read()).decode()
243
+
244
+ os.unlink(pdf_path) # Remove the temporary file
245
+
246
+ return html.Div([
247
+ html.H4("PDF Generated Successfully"),
248
+ html.A(
249
+ dbc.Button("Download PDF", color="success", className="mt-2"),
250
+ href=f"data:application/pdf;base64,{encoded}",
251
+ download="website_content.pdf"
252
+ )
253
+ ])
254
+ except Exception as e:
255
+ logger.error(f"Error creating download link: {str(e)}")
256
+ return f"An error occurred while creating the download link: {str(e)}"
257
 
258
  if __name__ == '__main__':
259
+ print("Starting the Dash application...")
260
+ app.run(debug=True, host='0.0.0.0', port=7860)
261
+ print("Dash application has finished running.")