bluenevus committed on
Commit
7e921f0
·
verified ·
1 Parent(s): bbd4d3f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -32
app.py CHANGED
@@ -151,7 +151,7 @@ async def crawl_pages(base_url, max_depth):
151
 
152
  return all_pages
153
 
154
- def website_to_pdf(all_pages):
155
  logger.info(f"Starting PDF generation for {len(all_pages)} pages")
156
 
157
  pdf = FPDF()
@@ -159,25 +159,31 @@ def website_to_pdf(all_pages):
159
  pdf.add_page()
160
  pdf.set_font("Arial", size=12)
161
 
162
- for page_url, content in all_pages:
163
- pdf.cell(0, 10, txt=page_url, ln=True)
164
- pdf.ln(5)
165
- for text in content:
166
- try:
167
- pdf.multi_cell(0, 10, txt=text[:200]) # Limit text length to avoid issues
168
- except Exception as e:
169
- logger.error(f"Error writing text to PDF: {str(e)}")
170
- if pdf.get_y() > 250: # Add a new page if the current page is almost full
171
- pdf.add_page()
 
 
 
 
 
 
172
 
173
  return pdf.output(dest='S').encode('latin-1') # Return bytes instead of BytesIO object
174
 
175
- async def process_url(url, depth):
176
  try:
177
  all_pages = await crawl_pages(url, depth)
178
  if not all_pages:
179
  return "No pages were successfully crawled. Please check the URL and try again."
180
- pdf_content = website_to_pdf(all_pages)
181
  return pdf_content # This is now bytes, not BytesIO
182
  except Exception as e:
183
  logger.error(f"Error in process_url: {str(e)}")
@@ -211,6 +217,7 @@ app.layout = dbc.Container([
211
  dcc.Slider(id="depth-slider", min=1, max=10, step=1, value=3, marks={i: str(i) for i in range(1, 11)}, className="mb-3"),
212
  dbc.Button("Convert to PDF", id="submit-button", color="primary", className="mb-3 w-100"),
213
  dbc.Spinner(html.Div(id="output-area"), color="primary", type="grow"),
 
214
  ]),
215
  className="mt-4"
216
  )
@@ -218,34 +225,72 @@ app.layout = dbc.Container([
218
 
@app.callback(
    Output("output-area", "children"),
    Input("submit-button", "n_clicks"),
    State("url-input", "value"),
    State("depth-slider", "value"),
    prevent_initial_call=True,
)
def update_output(n_clicks, url, depth):
    """Crawl the submitted URL and render a download link for the PDF.

    Returns a plain message string when no URL was entered or when
    ``process_url`` hands back a string instead of PDF bytes. Otherwise
    returns a ``Div`` holding a heading and a link whose ``href`` is a
    base64 ``data:`` URI carrying the PDF.
    """
    if not url:
        return "Please enter a valid URL."

    # The callback is synchronous, so the coroutine is driven to completion here.
    outcome = asyncio.run(process_url(url, depth))
    if isinstance(outcome, str):
        # A string result is a message for the user, not PDF data.
        return outcome

    try:
        payload = base64.b64encode(outcome).decode()
        heading = html.H4("PDF Generated Successfully")
        link = html.A(
            dbc.Button("Download PDF", color="success", className="mt-2"),
            href=f"data:application/pdf;base64,{payload}",
            download="website_content.pdf",
        )
        return html.Div([heading, link])
    except Exception as e:
        logger.error(f"Error creating download link: {str(e)}")
        return f"An error occurred while creating the download link: {str(e)}"
249
 
250
  if __name__ == '__main__':
251
  print("Starting the Dash application...")
 
151
 
152
  return all_pages
153
 
154
+ def website_to_pdf(all_pages, progress_callback):
155
  logger.info(f"Starting PDF generation for {len(all_pages)} pages")
156
 
157
  pdf = FPDF()
 
159
  pdf.add_page()
160
  pdf.set_font("Arial", size=12)
161
 
162
+ batch_size = 100
163
+ for i in range(0, len(all_pages), batch_size):
164
+ batch = all_pages[i:i+batch_size]
165
+ for page_url, content in batch:
166
+ pdf.cell(0, 10, txt=page_url, ln=True)
167
+ pdf.ln(5)
168
+ for text in content:
169
+ try:
170
+ pdf.multi_cell(0, 10, txt=text[:200]) # Limit text length to avoid issues
171
+ except Exception as e:
172
+ logger.error(f"Error writing text to PDF: {str(e)}")
173
+ if pdf.get_y() > 250: # Add a new page if the current page is almost full
174
+ pdf.add_page()
175
+
176
+ progress = min((i + batch_size) / len(all_pages), 1.0)
177
+ progress_callback(f"Processing pages... {progress:.0%}")
178
 
179
  return pdf.output(dest='S').encode('latin-1') # Return bytes instead of BytesIO object
180
 
181
+ async def process_url(url, depth, progress_callback):
182
  try:
183
  all_pages = await crawl_pages(url, depth)
184
  if not all_pages:
185
  return "No pages were successfully crawled. Please check the URL and try again."
186
+ pdf_content = website_to_pdf(all_pages, progress_callback)
187
  return pdf_content # This is now bytes, not BytesIO
188
  except Exception as e:
189
  logger.error(f"Error in process_url: {str(e)}")
 
217
  dcc.Slider(id="depth-slider", min=1, max=10, step=1, value=3, marks={i: str(i) for i in range(1, 11)}, className="mb-3"),
218
  dbc.Button("Convert to PDF", id="submit-button", color="primary", className="mb-3 w-100"),
219
  dbc.Spinner(html.Div(id="output-area"), color="primary", type="grow"),
220
+ dcc.Interval(id="progress-interval", interval=1000, n_intervals=0, disabled=True),
221
  ]),
222
  className="mt-4"
223
  )
 
225
 
226
  @app.callback(
227
  Output("output-area", "children"),
228
+ Output("progress-interval", "disabled"),
229
  Input("submit-button", "n_clicks"),
230
+ Input("progress-interval", "n_intervals"),
231
  State("url-input", "value"),
232
  State("depth-slider", "value"),
233
  prevent_initial_call=True
234
  )
235
+ def update_output(n_clicks, n_intervals, url, depth):
236
+ ctx = dash.callback_context
237
+ if not ctx.triggered:
238
+ raise PreventUpdate
239
+
240
+ triggered_id = ctx.triggered[0]['prop_id'].split('.')[0]
241
+
242
+ if triggered_id == "submit-button":
243
+ if not url:
244
+ return "Please enter a valid URL.", True
245
+
246
+ return dcc.Store(id='pdf-store', data='processing'), False
247
+
248
+ elif triggered_id == "progress-interval":
249
+ store = dash.callback_context.inputs.get('pdf-store', None)
250
+ if store is None or store == 'processing':
251
+ return "Processing... Please wait.", False
252
+
253
+ if isinstance(store, str) and store.startswith("Error"):
254
+ return store, True
255
+
256
+ try:
257
+ encoded = base64.b64encode(store).decode()
258
+ return html.Div([
259
+ html.H4("PDF Generated Successfully"),
260
+ html.A(
261
+ dbc.Button("Download PDF", color="success", className="mt-2"),
262
+ href=f"data:application/pdf;base64,{encoded}",
263
+ download="website_content.pdf"
264
+ )
265
+ ]), True
266
+ except Exception as e:
267
+ logger.error(f"Error creating download link: {str(e)}")
268
+ return f"An error occurred while creating the download link: {str(e)}", True
269
+
270
+ raise PreventUpdate
271
+
@app.callback(
    Output('pdf-store', 'data'),
    Input('submit-button', 'n_clicks'),
    State('url-input', 'value'),
    State('depth-slider', 'value'),
    prevent_initial_call=True
)
def generate_pdf(n_clicks, url, depth):
    """Crawl ``url`` to ``depth`` and place the outcome in ``pdf-store``.

    Args:
        n_clicks: Click count of the submit button; only used as the trigger.
        url: Address entered by the user.
        depth: Crawl depth chosen on the slider.

    Returns:
        A plain message string when no URL was given or when ``process_url``
        returns a string instead of PDF bytes. Otherwise the PDF as base64
        ASCII text, ready to embed after ``data:application/pdf;base64,``
        without encoding it again.
    """
    if not url:
        return "Please enter a valid URL."

    def progress_callback(message):
        # Progress used to be written to a local dict that nothing read;
        # logging it at least makes it observable on the server side.
        logger.info(message)

    pdf_content = asyncio.run(process_url(url, depth, progress_callback))

    if isinstance(pdf_content, str):
        return pdf_content  # This is a message, not PDF data

    # Callback outputs are serialised to JSON and raw bytes are not
    # JSON-serialisable, so the PDF is handed over as base64 text.
    return base64.b64encode(pdf_content).decode("ascii")
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
  if __name__ == '__main__':
296
  print("Starting the Dash application...")