bluenevus committed on
Commit
e82072d
·
verified ·
1 Parent(s): 52e01cc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -115
app.py CHANGED
@@ -195,118 +195,4 @@ def website_to_pdf(all_pages, progress_callback):
195
  progress_callback(f"Processing pages... {progress:.0%}")
196
  logger.info(f"Generated PDF chunk {i//chunk_size + 1}/{total_chunks}")
197
 
198
- logger.info("Merging PDF chunks...")
199
- output_pdf = os.path.join(temp_dir, "final.pdf")
200
- merger = PdfMerger()
201
- for temp_file in temp_files:
202
- merger.append(temp_file)
203
- merger.write(output_pdf)
204
- merger.close()
205
-
206
- logger.info("PDF generation complete. Reading final PDF...")
207
- with open(output_pdf, 'rb') as f:
208
- return f.read()
209
-
210
- async def process_url(url, depth, progress_callback):
211
- try:
212
- all_pages = await crawl_pages(url, depth)
213
- if not all_pages:
214
- return "No pages were successfully crawled. Please check the URL and try again."
215
-
216
- logger.info("Crawling complete. Starting PDF generation...")
217
- # Use ThreadPoolExecutor to run PDF generation in a separate thread
218
- loop = asyncio.get_event_loop()
219
- pdf_content = await loop.run_in_executor(executor, website_to_pdf, all_pages, progress_callback)
220
- logger.info("PDF generation complete.")
221
- return pdf_content
222
- except Exception as e:
223
- logger.error(f"Error in process_url: {str(e)}")
224
- return f"An error occurred: {str(e)}"
225
-
226
- # App layout
227
- app.layout = dbc.Container([
228
- dcc.Store(id='pdf-store'),
229
- dcc.Store(id='progress-store'),
230
- dbc.Card(
231
- dbc.CardBody([
232
- html.H1("Website to PDF Converter", className="text-center mb-4"),
233
- html.P("Enter docs URL and crawl depth to convert documentation pages into a PDF. Be responsible for sites you have permission to do this", className="text-center mb-4"),
234
- dbc.Input(id="url-input", type="text", placeholder="Enter website URL (e.g., https://www.gradio.app/docs)", className="mb-3"),
235
- dcc.Slider(id="depth-slider", min=1, max=10, step=1, value=3, marks={i: str(i) for i in range(1, 11)}, className="mb-3"),
236
- dbc.Button("Convert to PDF", id="submit-button", color="primary", className="mb-3 w-100"),
237
- dbc.Button("Download PDF", id="download-button", color="secondary", className="mb-3 w-100", disabled=True),
238
- html.Div([
239
- dbc.Spinner(html.Div(id="progress-message"), color="primary", type="grow", size="lg"),
240
- ], className="text-center mb-3"),
241
- dcc.Interval(id="progress-interval", interval=1000, n_intervals=0, disabled=True),
242
- dcc.Download(id="download-pdf")
243
- ]),
244
- className="mt-4"
245
- )
246
- ], fluid=True)
247
-
248
- def update_output(n_clicks, n_intervals, progress_data, url, depth):
249
- ctx = dash.callback_context
250
- if not ctx.triggered:
251
- raise PreventUpdate
252
-
253
- triggered_id = ctx.triggered[0]['prop_id'].split('.')[0]
254
-
255
- if triggered_id == "submit-button":
256
- if not url:
257
- return True, "secondary", True, None, "Please enter a URL"
258
-
259
- # Start the background task
260
- task_id = str(uuid.uuid4())
261
- executor.submit(background_task, url, depth, task_id)
262
-
263
- return True, "secondary", False, None, "Processing... Please wait."
264
-
265
- elif triggered_id == "progress-interval" or triggered_id == "progress-store":
266
- if progress_data is None:
267
- return True, "secondary", False, None, "Processing... Please wait."
268
-
269
- if isinstance(progress_data, str):
270
- if progress_data.startswith("Error"):
271
- return True, "secondary", True, None, progress_data
272
- else:
273
- return True, "secondary", False, None, progress_data
274
-
275
- if isinstance(progress_data, bytes):
276
- encoded = base64.b64encode(progress_data).decode()
277
- return False, "primary", True, encoded, "PDF ready for download!"
278
-
279
- return True, "secondary", False, None, ""
280
-
281
- @app.callback(
282
- Output("download-pdf", "data"),
283
- Input("download-button", "n_clicks"),
284
- State("pdf-store", "data"),
285
- prevent_initial_call=True
286
- )
287
- def download_pdf(n_clicks, pdf_data):
288
- if pdf_data is None:
289
- raise PreventUpdate
290
-
291
- decoded = base64.b64decode(pdf_data)
292
- return dcc.send_bytes(decoded, f"website_content_{int(time.time())}.pdf")
293
-
294
- def background_task(url, depth, task_id):
295
- def progress_callback(message):
296
- # Update progress in the progress-store
297
- app.layout.children[1].data = message
298
-
299
- try:
300
- logger.info(f"Starting background task for URL: {url}, depth: {depth}")
301
- pdf_content = asyncio.run(process_url(url, depth, progress_callback))
302
- logger.info("Background task completed successfully")
303
- # Store the PDF content directly in the progress-store
304
- app.layout.children[1].data = pdf_content
305
- except Exception as e:
306
- logger.error(f"Error in background task: {str(e)}")
307
- app.layout.children[1].data = f"Error: {str(e)}"
308
-
309
- if __name__ == '__main__':
310
- print("Starting the Dash application...")
311
- app.run(debug=True, host='0.0.0.0', port=7860)
312
- print("Dash application has finished running.")
 
195
  progress_callback(f"Processing pages... {progress:.0%}")
196
  logger.info(f"Generated PDF chunk {i//chunk_size + 1}/{total_chunks}")
197
 
198
+ logger.info("Mer