bluenevus committed on
Commit
e037a10
·
verified ·
1 Parent(s): 94b0270

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -40
app.py CHANGED
@@ -19,13 +19,15 @@ import time
19
  import os
20
  import ssl
21
  from io import BytesIO
 
 
22
 
23
  # Initialize Dash app
24
  app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
25
  server = app.server
26
 
27
  # Logging setup
28
- logging.basicConfig(level=logging.INFO)
29
  logger = logging.getLogger(__name__)
30
 
31
  # Thread-local storage for database connections
@@ -108,7 +110,7 @@ async def get_links(session, url, base_url):
108
  logger.error(f"Error getting links from {url}: {str(e)}")
109
  return []
110
 
111
- async def crawl_pages(base_url, max_depth):
112
  visited = set()
113
  to_visit = [(base_url, 0)]
114
  all_pages = []
@@ -140,6 +142,9 @@ async def crawl_pages(base_url, max_depth):
140
  all_pages.append((current_url, content))
141
  logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")
142
 
 
 
 
143
  if depth < max_depth:
144
  links = await get_links(session, current_url, base_url)
145
  for link in links:
@@ -151,48 +156,75 @@ async def crawl_pages(base_url, max_depth):
151
 
152
  return all_pages
153
 
154
- def website_to_pdf(all_pages, progress_callback):
155
- logger.info(f"Starting PDF generation for {len(all_pages)} pages")
156
-
157
  pdf = FPDF()
158
  pdf.set_auto_page_break(auto=True, margin=15)
159
  pdf.add_page()
160
  pdf.set_font("Arial", size=12)
161
 
162
- batch_size = 100
163
- for i in range(0, len(all_pages), batch_size):
164
- batch = all_pages[i:i+batch_size]
165
- for page_url, content in batch:
166
- pdf.cell(0, 10, txt=page_url, ln=True)
167
- pdf.ln(5)
168
- for text in content:
169
- try:
170
- pdf.multi_cell(0, 10, txt=text[:200]) # Limit text length to avoid issues
171
- except Exception as e:
172
- logger.error(f"Error writing text to PDF: {str(e)}")
173
- if pdf.get_y() > 250: # Add a new page if the current page is almost full
174
- pdf.add_page()
175
-
176
- progress = min((i + batch_size) / len(all_pages), 1.0)
177
- progress_callback(f"Processing pages... {progress:.0%}")
178
-
179
- return pdf.output(dest='S').encode('latin-1') # Return bytes instead of BytesIO object
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
  async def process_url(url, depth, progress_callback):
182
  try:
183
- all_pages = await crawl_pages(url, depth)
184
  if not all_pages:
185
  return "No pages were successfully crawled. Please check the URL and try again."
186
- pdf_content = website_to_pdf(all_pages, progress_callback)
187
- return pdf_content # This is now bytes, not BytesIO
 
 
 
188
  except Exception as e:
189
  logger.error(f"Error in process_url: {str(e)}")
190
  return f"An error occurred: {str(e)}"
191
 
192
- # App layout
193
  # App layout
194
  app.layout = dbc.Container([
195
- dcc.Store(id='pdf-store'), # Add this line at the beginning of the layout
 
196
  dbc.Navbar(
197
  dbc.Container([
198
  html.A(
@@ -218,6 +250,7 @@ app.layout = dbc.Container([
218
  dbc.Input(id="url-input", type="text", placeholder="Enter website URL (e.g., https://www.gradio.app/docs)", className="mb-3"),
219
  dcc.Slider(id="depth-slider", min=1, max=10, step=1, value=3, marks={i: str(i) for i in range(1, 11)}, className="mb-3"),
220
  dbc.Button("Convert to PDF", id="submit-button", color="primary", className="mb-3 w-100"),
 
221
  dbc.Spinner(html.Div(id="output-area"), color="primary", type="grow"),
222
  dcc.Interval(id="progress-interval", interval=1000, n_intervals=0, disabled=True),
223
  ]),
@@ -228,13 +261,16 @@ app.layout = dbc.Container([
228
  @app.callback(
229
  Output("output-area", "children"),
230
  Output("progress-interval", "disabled"),
 
 
231
  Input("submit-button", "n_clicks"),
232
  Input("progress-interval", "n_intervals"),
233
  State("url-input", "value"),
234
  State("depth-slider", "value"),
 
235
  prevent_initial_call=True
236
  )
237
- def update_output(n_clicks, n_intervals, url, depth):
238
  ctx = dash.callback_context
239
  if not ctx.triggered:
240
  raise PreventUpdate
@@ -243,17 +279,19 @@ def update_output(n_clicks, n_intervals, url, depth):
243
 
244
  if triggered_id == "submit-button":
245
  if not url:
246
- return "Please enter a valid URL.", True
247
 
248
- return dcc.Store(id='pdf-store', data='processing'), False
249
 
250
  elif triggered_id == "progress-interval":
251
  store = dash.callback_context.inputs.get('pdf-store', None)
252
- if store is None or store == 'processing':
253
- return "Processing... Please wait.", False
 
 
254
 
255
  if isinstance(store, str) and store.startswith("Error"):
256
- return store, True
257
 
258
  try:
259
  encoded = base64.b64encode(store).decode()
@@ -264,15 +302,16 @@ def update_output(n_clicks, n_intervals, url, depth):
264
  href=f"data:application/pdf;base64,{encoded}",
265
  download="website_content.pdf"
266
  )
267
- ]), True
268
  except Exception as e:
269
  logger.error(f"Error creating download link: {str(e)}")
270
- return f"An error occurred while creating the download link: {str(e)}", True
271
 
272
  raise PreventUpdate
273
 
274
  @app.callback(
275
  Output('pdf-store', 'data'),
 
276
  Input('submit-button', 'n_clicks'),
277
  State('url-input', 'value'),
278
  State('depth-slider', 'value'),
@@ -280,9 +319,9 @@ def update_output(n_clicks, n_intervals, url, depth):
280
  )
281
  def generate_pdf(n_clicks, url, depth):
282
  if not url:
283
- return "Please enter a valid URL."
284
 
285
- progress_store = {'progress': 0}
286
 
287
  def progress_callback(message):
288
  progress_store['progress'] = message
@@ -290,9 +329,9 @@ def generate_pdf(n_clicks, url, depth):
290
  pdf_content = asyncio.run(process_url(url, depth, progress_callback))
291
 
292
  if isinstance(pdf_content, str):
293
- return pdf_content # This is an error message
294
 
295
- return pdf_content
296
 
297
  if __name__ == '__main__':
298
  print("Starting the Dash application...")
 
19
  import os
20
  import ssl
21
  from io import BytesIO
22
+ from concurrent.futures import ThreadPoolExecutor
23
+ import math
24
 
25
  # Initialize Dash app
26
  app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
27
  server = app.server
28
 
29
  # Logging setup
30
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
31
  logger = logging.getLogger(__name__)
32
 
33
  # Thread-local storage for database connections
 
110
  logger.error(f"Error getting links from {url}: {str(e)}")
111
  return []
112
 
113
+ async def crawl_pages(base_url, max_depth, progress_callback):
114
  visited = set()
115
  to_visit = [(base_url, 0)]
116
  all_pages = []
 
142
  all_pages.append((current_url, content))
143
  logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")
144
 
145
+ progress = len(all_pages) / (max_depth * 10) # Rough estimate
146
+ progress_callback(f"Crawling pages... {progress:.0%}")
147
+
148
  if depth < max_depth:
149
  links = await get_links(session, current_url, base_url)
150
  for link in links:
 
156
 
157
  return all_pages
158
 
159
+ def create_pdf_chunk(chunk, start_index):
 
 
160
  pdf = FPDF()
161
  pdf.set_auto_page_break(auto=True, margin=15)
162
  pdf.add_page()
163
  pdf.set_font("Arial", size=12)
164
 
165
+ for i, (page_url, content) in enumerate(chunk, start=start_index):
166
+ pdf.cell(0, 10, txt=f"Page {i+1}: {page_url}", ln=True)
167
+ pdf.ln(5)
168
+ for text in content:
169
+ try:
170
+ pdf.multi_cell(0, 10, txt=text[:200]) # Limit text length to avoid issues
171
+ except Exception as e:
172
+ logger.error(f"Error writing text to PDF: {str(e)}")
173
+ pdf.add_page()
174
+
175
+ return pdf.output(dest='S').encode('latin-1')
176
+
177
+ async def website_to_pdf(all_pages, progress_callback):
178
+ logger.info(f"Starting PDF generation for {len(all_pages)} pages")
179
+
180
+ chunk_size = 100
181
+ num_chunks = math.ceil(len(all_pages) / chunk_size)
182
+ pdf_chunks = []
183
+
184
+ with ThreadPoolExecutor() as executor:
185
+ futures = []
186
+ for i in range(num_chunks):
187
+ start = i * chunk_size
188
+ end = min((i + 1) * chunk_size, len(all_pages))
189
+ chunk = all_pages[start:end]
190
+ future = executor.submit(create_pdf_chunk, chunk, start)
191
+ futures.append(future)
192
+
193
+ for i, future in enumerate(futures):
194
+ try:
195
+ pdf_chunk = await asyncio.wrap_future(future)
196
+ pdf_chunks.append(pdf_chunk)
197
+ progress = (i + 1) / num_chunks
198
+ progress_callback(f"Generating PDF... {progress:.0%}")
199
+ except Exception as e:
200
+ logger.error(f"Error generating PDF chunk {i}: {str(e)}")
201
+
202
+ # Combine PDF chunks
203
+ combined_pdf = FPDF()
204
+ for chunk in pdf_chunks:
205
+ combined_pdf.add_page()
206
+ combined_pdf.put_file(chunk)
207
+
208
+ return combined_pdf.output(dest='S').encode('latin-1')
209
 
210
  async def process_url(url, depth, progress_callback):
211
  try:
212
+ all_pages = await asyncio.wait_for(crawl_pages(url, depth, progress_callback), timeout=3600) # 1 hour timeout
213
  if not all_pages:
214
  return "No pages were successfully crawled. Please check the URL and try again."
215
+ pdf_content = await asyncio.wait_for(website_to_pdf(all_pages, progress_callback), timeout=3600) # 1 hour timeout for PDF generation
216
+ return pdf_content
217
+ except asyncio.TimeoutError:
218
+ logger.error("Process timed out after 1 hour")
219
+ return "The process timed out after 1 hour. Please try again with a smaller depth or a more specific URL."
220
  except Exception as e:
221
  logger.error(f"Error in process_url: {str(e)}")
222
  return f"An error occurred: {str(e)}"
223
 
 
224
  # App layout
225
  app.layout = dbc.Container([
226
+ dcc.Store(id='pdf-store'),
227
+ dcc.Store(id='progress-store'),
228
  dbc.Navbar(
229
  dbc.Container([
230
  html.A(
 
250
  dbc.Input(id="url-input", type="text", placeholder="Enter website URL (e.g., https://www.gradio.app/docs)", className="mb-3"),
251
  dcc.Slider(id="depth-slider", min=1, max=10, step=1, value=3, marks={i: str(i) for i in range(1, 11)}, className="mb-3"),
252
  dbc.Button("Convert to PDF", id="submit-button", color="primary", className="mb-3 w-100"),
253
+ dbc.Progress(id="progress-bar", animated=True, striped=True, className="mb-3"),
254
  dbc.Spinner(html.Div(id="output-area"), color="primary", type="grow"),
255
  dcc.Interval(id="progress-interval", interval=1000, n_intervals=0, disabled=True),
256
  ]),
 
261
  @app.callback(
262
  Output("output-area", "children"),
263
  Output("progress-interval", "disabled"),
264
+ Output("progress-bar", "value"),
265
+ Output("progress-bar", "label"),
266
  Input("submit-button", "n_clicks"),
267
  Input("progress-interval", "n_intervals"),
268
  State("url-input", "value"),
269
  State("depth-slider", "value"),
270
+ State("progress-store", "data"),
271
  prevent_initial_call=True
272
  )
273
+ def update_output(n_clicks, n_intervals, url, depth, progress):
274
  ctx = dash.callback_context
275
  if not ctx.triggered:
276
  raise PreventUpdate
 
279
 
280
  if triggered_id == "submit-button":
281
  if not url:
282
+ return "Please enter a valid URL.", True, 0, ""
283
 
284
+ return "Processing... Please wait.", False, 0, "0%"
285
 
286
  elif triggered_id == "progress-interval":
287
  store = dash.callback_context.inputs.get('pdf-store', None)
288
+ if store is None:
289
+ if progress:
290
+ return "Processing... Please wait.", False, int(progress.split('%')[0]), progress
291
+ return "Processing... Please wait.", False, 0, "0%"
292
 
293
  if isinstance(store, str) and store.startswith("Error"):
294
+ return store, True, 100, "100%"
295
 
296
  try:
297
  encoded = base64.b64encode(store).decode()
 
302
  href=f"data:application/pdf;base64,{encoded}",
303
  download="website_content.pdf"
304
  )
305
+ ]), True, 100, "100%"
306
  except Exception as e:
307
  logger.error(f"Error creating download link: {str(e)}")
308
+ return f"An error occurred while creating the download link: {str(e)}", True, 100, "100%"
309
 
310
  raise PreventUpdate
311
 
312
  @app.callback(
313
  Output('pdf-store', 'data'),
314
+ Output('progress-store', 'data'),
315
  Input('submit-button', 'n_clicks'),
316
  State('url-input', 'value'),
317
  State('depth-slider', 'value'),
 
319
  )
320
  def generate_pdf(n_clicks, url, depth):
321
  if not url:
322
+ return "Please enter a valid URL.", "0%"
323
 
324
+ progress_store = {'progress': "0%"}
325
 
326
  def progress_callback(message):
327
  progress_store['progress'] = message
 
329
  pdf_content = asyncio.run(process_url(url, depth, progress_callback))
330
 
331
  if isinstance(pdf_content, str):
332
+ return pdf_content, "100%" # This is an error message
333
 
334
+ return pdf_content, "100%"
335
 
336
  if __name__ == '__main__':
337
  print("Starting the Dash application...")