bluenevus committed (verified)
Commit 512b4c4 · Parent: 69b83fd

Update app.py

Files changed (1):
  1. app.py (+84 -85)
app.py CHANGED
@@ -19,17 +19,16 @@ import time
 import os
 import ssl
 from io import BytesIO
+import tempfile
+import uuid
 from concurrent.futures import ThreadPoolExecutor
-import math
-from PyPDF2 import PdfMerger
-
 
 # Initialize Dash app
 app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
 server = app.server
 
 # Logging setup
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 # Thread-local storage for database connections
@@ -43,6 +42,9 @@ ssl_context = ssl.create_default_context()
 ssl_context.check_hostname = False
 ssl_context.verify_mode = ssl.CERT_NONE
 
+# ThreadPoolExecutor for background tasks
+executor = ThreadPoolExecutor(max_workers=4)
+
 @contextmanager
 def get_db_connection():
     if not hasattr(thread_local, "connection"):
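
The new module-level executor is the standard way to hand blocking work to a bounded thread pool from request handlers. A minimal, self-contained sketch of the submit/result pattern (illustrative names, not from app.py):

    from concurrent.futures import ThreadPoolExecutor

    executor = ThreadPoolExecutor(max_workers=4)  # bounded pool, as in this commit

    def slow_job(url):
        # stand-in for blocking work such as PDF generation
        return f"done: {url}"

    future = executor.submit(slow_job, "https://example.com")  # returns immediately
    print(future.result())  # blocks only when the caller needs the value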
@@ -112,7 +114,7 @@ async def get_links(session, url, base_url):
         logger.error(f"Error getting links from {url}: {str(e)}")
         return []
 
-async def crawl_pages(base_url, max_depth, progress_callback):
+async def crawl_pages(base_url, max_depth):
     visited = set()
     to_visit = [(base_url, 0)]
     all_pages = []
@@ -144,9 +146,6 @@ async def crawl_pages(base_url, max_depth, progress_callback):
             all_pages.append((current_url, content))
             logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")
 
-            progress = len(all_pages) / (max_depth * 10) # Rough estimate
-            progress_callback(f"Crawling pages... {progress:.0%}")
-
             if depth < max_depth:
                 links = await get_links(session, current_url, base_url)
                 for link in links:
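
The loop this hunk trims follows a standard depth-limited, breadth-first crawl: a visited set, a to_visit queue of (url, depth) pairs, and a depth cutoff. A condensed synchronous sketch of that pattern (the real crawl_pages is async and fetches page content as it goes):

    def crawl(base_url, max_depth, get_links):
        visited = set()
        to_visit = [(base_url, 0)]
        pages = []
        while to_visit:
            url, depth = to_visit.pop(0)  # FIFO -> breadth-first
            if url in visited:
                continue
            visited.add(url)
            pages.append(url)
            if depth < max_depth:
                for link in get_links(url):
                    if link not in visited:
                        to_visit.append((link, depth + 1))
        return pages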
@@ -158,70 +157,63 @@ async def crawl_pages(base_url, max_depth, progress_callback):
 
     return all_pages
 
-def create_pdf_chunk(chunk, start_index):
+def generate_pdf_chunk(chunk, output_file):
     pdf = FPDF()
     pdf.set_auto_page_break(auto=True, margin=15)
     pdf.add_page()
     pdf.set_font("Arial", size=12)
 
-    for i, (page_url, content) in enumerate(chunk, start=start_index):
-        pdf.cell(0, 10, txt=f"Page {i+1}: {page_url}", ln=True)
+    for page_url, content in chunk:
+        pdf.cell(0, 10, txt=page_url, ln=True)
         pdf.ln(5)
         for text in content:
            try:
                pdf.multi_cell(0, 10, txt=text[:200]) # Limit text length to avoid issues
            except Exception as e:
                logger.error(f"Error writing text to PDF: {str(e)}")
-        pdf.add_page()
+        if pdf.get_y() > 250: # Add a new page if the current page is almost full
+            pdf.add_page()
 
-    return pdf.output(dest='S').encode('latin-1')
+    pdf.output(output_file)
 
-async def website_to_pdf(all_pages, progress_callback):
+def website_to_pdf(all_pages, progress_callback):
     logger.info(f"Starting PDF generation for {len(all_pages)} pages")
 
     chunk_size = 100
-    num_chunks = math.ceil(len(all_pages) / chunk_size)
-    pdf_chunks = []
-
-    with ThreadPoolExecutor() as executor:
-        futures = []
-        for i in range(num_chunks):
-            start = i * chunk_size
-            end = min((i + 1) * chunk_size, len(all_pages))
-            chunk = all_pages[start:end]
-            future = executor.submit(create_pdf_chunk, chunk, start)
-            futures.append(future)
-
-        for i, future in enumerate(futures):
-            try:
-                pdf_chunk = await asyncio.wrap_future(future)
-                pdf_chunks.append(pdf_chunk)
-                progress = (i + 1) / num_chunks
-                progress_callback(f"Generating PDF... {progress:.0%}")
-            except Exception as e:
-                logger.error(f"Error generating PDF chunk {i}: {str(e)}")
-
-    # Combine PDF chunks using PyPDF2
-    merger = PdfMerger()
-    for chunk in pdf_chunks:
-        merger.append(BytesIO(chunk))
-
-    output = BytesIO()
-    merger.write(output)
-    merger.close()
-
-    return output.getvalue()
+    total_chunks = (len(all_pages) + chunk_size - 1) // chunk_size
+    temp_files = []
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        for i in range(0, len(all_pages), chunk_size):
+            chunk = all_pages[i:i+chunk_size]
+            temp_file = os.path.join(temp_dir, f"chunk_{i}.pdf")
+            generate_pdf_chunk(chunk, temp_file)
+            temp_files.append(temp_file)
+
+            progress = min((i + chunk_size) / len(all_pages), 1.0)
+            progress_callback(f"Processing pages... {progress:.0%}")
+
+        # Merge PDF chunks
+        output_pdf = os.path.join(temp_dir, "final.pdf")
+        merger = PdfMerger()
+        for temp_file in temp_files:
+            merger.append(temp_file)
+        merger.write(output_pdf)
+        merger.close()
+
+        with open(output_pdf, 'rb') as f:
+            return f.read()
 
 async def process_url(url, depth, progress_callback):
     try:
-        all_pages = await asyncio.wait_for(crawl_pages(url, depth, progress_callback), timeout=3600) # 1 hour timeout
+        all_pages = await crawl_pages(url, depth)
         if not all_pages:
             return "No pages were successfully crawled. Please check the URL and try again."
-        pdf_content = await asyncio.wait_for(website_to_pdf(all_pages, progress_callback), timeout=3600) # 1 hour timeout for PDF generation
+
+        # Use ThreadPoolExecutor to run PDF generation in a separate thread
+        loop = asyncio.get_event_loop()
+        pdf_content = await loop.run_in_executor(executor, website_to_pdf, all_pages, progress_callback)
        return pdf_content
-    except asyncio.TimeoutError:
-        logger.error("Process timed out after 1 hour")
-        return "The process timed out after 1 hour. Please try again with a smaller depth or a more specific URL."
     except Exception as e:
         logger.error(f"Error in process_url: {str(e)}")
         return f"An error occurred: {str(e)}"
@@ -255,7 +247,7 @@ app.layout = dbc.Container([
     dbc.Input(id="url-input", type="text", placeholder="Enter website URL (e.g., https://www.gradio.app/docs)", className="mb-3"),
     dcc.Slider(id="depth-slider", min=1, max=10, step=1, value=3, marks={i: str(i) for i in range(1, 11)}, className="mb-3"),
     dbc.Button("Convert to PDF", id="submit-button", color="primary", className="mb-3 w-100"),
-    dbc.Progress(id="progress-bar", animated=True, striped=True, className="mb-3"),
+    dbc.Progress(id="progress-bar", style={"visibility": "hidden"}),
     dbc.Spinner(html.Div(id="output-area"), color="primary", type="grow"),
     dcc.Interval(id="progress-interval", interval=1000, n_intervals=0, disabled=True),
 ]),
@@ -266,16 +258,14 @@ app.layout = dbc.Container([
 @app.callback(
     Output("output-area", "children"),
     Output("progress-interval", "disabled"),
-    Output("progress-bar", "value"),
-    Output("progress-bar", "label"),
+    Output("progress-bar", "style"),
     Input("submit-button", "n_clicks"),
     Input("progress-interval", "n_intervals"),
     State("url-input", "value"),
     State("depth-slider", "value"),
-    State("progress-store", "data"),
     prevent_initial_call=True
 )
-def update_output(n_clicks, n_intervals, url, depth, progress):
+def update_output(n_clicks, n_intervals, url, depth):
     ctx = dash.callback_context
     if not ctx.triggered:
         raise PreventUpdate
@@ -284,22 +274,29 @@ def update_output(n_clicks, n_intervals, url, depth, progress):
 
     if triggered_id == "submit-button":
         if not url:
-            return "Please enter a valid URL.", True, 0, ""
+            return "Please enter a valid URL.", True, {"visibility": "hidden"}
 
-        return "Processing... Please wait.", False, 0, "0%"
+        # Start the background task
+        task_id = str(uuid.uuid4())
+        executor.submit(background_task, url, depth, task_id)
+
+        return "Processing... Please wait.", False, {"visibility": "visible"}
 
     elif triggered_id == "progress-interval":
-        store = dash.callback_context.inputs.get('pdf-store', None)
-        if store is None:
-            if progress:
-                return "Processing... Please wait.", False, int(progress.split('%')[0]), progress
-            return "Processing... Please wait.", False, 0, "0%"
-
-        if isinstance(store, str) and store.startswith("Error"):
-            return store, True, 100, "100%"
-
+        # Check progress
+        progress = dash.callback_context.inputs['progress-store.data']
+        if progress is None:
+            return "Processing... Please wait.", False, {"visibility": "visible"}
+
+        if isinstance(progress, str) and progress.startswith("Error"):
+            return progress, True, {"visibility": "hidden"}
+
+        if isinstance(progress, str) and progress.startswith("Processing"):
+            return progress, False, {"visibility": "visible"}
+
+        # PDF generation complete
         try:
-            encoded = base64.b64encode(store).decode()
+            encoded = base64.b64encode(progress).decode()
             return html.Div([
                 html.H4("PDF Generated Successfully"),
                 html.A(
@@ -307,36 +304,38 @@ def update_output(n_clicks, n_intervals, url, depth, progress):
                     href=f"data:application/pdf;base64,{encoded}",
                     download="website_content.pdf"
                 )
-            ]), True, 100, "100%"
+            ]), True, {"visibility": "hidden"}
         except Exception as e:
             logger.error(f"Error creating download link: {str(e)}")
-            return f"An error occurred while creating the download link: {str(e)}", True, 100, "100%"
+            return f"An error occurred while creating the download link: {str(e)}", True, {"visibility": "hidden"}
 
     raise PreventUpdate
 
 @app.callback(
-    Output('pdf-store', 'data'),
     Output('progress-store', 'data'),
-    Input('submit-button', 'n_clicks'),
-    State('url-input', 'value'),
-    State('depth-slider', 'value'),
+    Input('progress-interval', 'n_intervals'),
     prevent_initial_call=True
 )
-def generate_pdf(n_clicks, url, depth):
-    if not url:
-        return "Please enter a valid URL.", "0%"
-
-    progress_store = {'progress': "0%"}
+def update_progress(n):
+    # This function will be called every second to update the progress
+    # You can implement a mechanism to check the actual progress of the PDF generation
+    # For now, we'll just return a placeholder message
+    return "Processing... Please wait."
 
+def background_task(url, depth, task_id):
     def progress_callback(message):
-        progress_store['progress'] = message
-
-    pdf_content = asyncio.run(process_url(url, depth, progress_callback))
+        # Update progress in the database or a shared data structure
+        pass
 
-    if isinstance(pdf_content, str):
-        return pdf_content, "100%" # This is an error message
-
-    return pdf_content, "100%"
+    try:
+        pdf_content = asyncio.run(process_url(url, depth, progress_callback))
+        # Store the result in a database or shared data structure
+        # For simplicity, we'll use the progress-store, but in a real application,
+        # you should use a more robust solution for storing large data
+        app.layout.children[1].data = pdf_content
+    except Exception as e:
+        logger.error(f"Error in background task: {str(e)}")
+        app.layout.children[1].data = f"Error: {str(e)}"
 
 if __name__ == '__main__':
     print("Starting the Dash application...")
 