Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -151,7 +151,7 @@ async def crawl_pages(base_url, max_depth):
|
|
151 |
|
152 |
return all_pages
|
153 |
|
154 |
-
def website_to_pdf(all_pages):
|
155 |
logger.info(f"Starting PDF generation for {len(all_pages)} pages")
|
156 |
|
157 |
pdf = FPDF()
|
@@ -159,25 +159,31 @@ def website_to_pdf(all_pages):
|
|
159 |
pdf.add_page()
|
160 |
pdf.set_font("Arial", size=12)
|
161 |
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
for
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
|
173 |
return pdf.output(dest='S').encode('latin-1') # Return bytes instead of BytesIO object
|
174 |
|
175 |
-
async def process_url(url, depth):
|
176 |
try:
|
177 |
all_pages = await crawl_pages(url, depth)
|
178 |
if not all_pages:
|
179 |
return "No pages were successfully crawled. Please check the URL and try again."
|
180 |
-
pdf_content = website_to_pdf(all_pages)
|
181 |
return pdf_content # This is now bytes, not BytesIO
|
182 |
except Exception as e:
|
183 |
logger.error(f"Error in process_url: {str(e)}")
|
@@ -211,6 +217,7 @@ app.layout = dbc.Container([
|
|
211 |
dcc.Slider(id="depth-slider", min=1, max=10, step=1, value=3, marks={i: str(i) for i in range(1, 11)}, className="mb-3"),
|
212 |
dbc.Button("Convert to PDF", id="submit-button", color="primary", className="mb-3 w-100"),
|
213 |
dbc.Spinner(html.Div(id="output-area"), color="primary", type="grow"),
|
|
|
214 |
]),
|
215 |
className="mt-4"
|
216 |
)
|
@@ -218,34 +225,72 @@ app.layout = dbc.Container([
|
|
218 |
|
219 |
@app.callback(
|
220 |
Output("output-area", "children"),
|
|
|
221 |
Input("submit-button", "n_clicks"),
|
|
|
222 |
State("url-input", "value"),
|
223 |
State("depth-slider", "value"),
|
224 |
prevent_initial_call=True
|
225 |
)
|
226 |
-
def update_output(n_clicks, url, depth):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
227 |
if not url:
|
228 |
return "Please enter a valid URL."
|
229 |
-
|
230 |
-
|
231 |
-
|
|
|
|
|
|
|
|
|
|
|
232 |
if isinstance(pdf_content, str):
|
233 |
return pdf_content # This is an error message
|
234 |
-
|
235 |
-
|
236 |
-
encoded = base64.b64encode(pdf_content).decode()
|
237 |
-
|
238 |
-
return html.Div([
|
239 |
-
html.H4("PDF Generated Successfully"),
|
240 |
-
html.A(
|
241 |
-
dbc.Button("Download PDF", color="success", className="mt-2"),
|
242 |
-
href=f"data:application/pdf;base64,{encoded}",
|
243 |
-
download="website_content.pdf"
|
244 |
-
)
|
245 |
-
])
|
246 |
-
except Exception as e:
|
247 |
-
logger.error(f"Error creating download link: {str(e)}")
|
248 |
-
return f"An error occurred while creating the download link: {str(e)}"
|
249 |
|
250 |
if __name__ == '__main__':
|
251 |
print("Starting the Dash application...")
|
|
|
151 |
|
152 |
return all_pages
|
153 |
|
154 |
+
def website_to_pdf(all_pages, progress_callback):
|
155 |
logger.info(f"Starting PDF generation for {len(all_pages)} pages")
|
156 |
|
157 |
pdf = FPDF()
|
|
|
159 |
pdf.add_page()
|
160 |
pdf.set_font("Arial", size=12)
|
161 |
|
162 |
+
batch_size = 100
|
163 |
+
for i in range(0, len(all_pages), batch_size):
|
164 |
+
batch = all_pages[i:i+batch_size]
|
165 |
+
for page_url, content in batch:
|
166 |
+
pdf.cell(0, 10, txt=page_url, ln=True)
|
167 |
+
pdf.ln(5)
|
168 |
+
for text in content:
|
169 |
+
try:
|
170 |
+
pdf.multi_cell(0, 10, txt=text[:200]) # Limit text length to avoid issues
|
171 |
+
except Exception as e:
|
172 |
+
logger.error(f"Error writing text to PDF: {str(e)}")
|
173 |
+
if pdf.get_y() > 250: # Add a new page if the current page is almost full
|
174 |
+
pdf.add_page()
|
175 |
+
|
176 |
+
progress = min((i + batch_size) / len(all_pages), 1.0)
|
177 |
+
progress_callback(f"Processing pages... {progress:.0%}")
|
178 |
|
179 |
return pdf.output(dest='S').encode('latin-1') # Return bytes instead of BytesIO object
|
180 |
|
181 |
+
async def process_url(url, depth, progress_callback):
|
182 |
try:
|
183 |
all_pages = await crawl_pages(url, depth)
|
184 |
if not all_pages:
|
185 |
return "No pages were successfully crawled. Please check the URL and try again."
|
186 |
+
pdf_content = website_to_pdf(all_pages, progress_callback)
|
187 |
return pdf_content # This is now bytes, not BytesIO
|
188 |
except Exception as e:
|
189 |
logger.error(f"Error in process_url: {str(e)}")
|
|
|
217 |
dcc.Slider(id="depth-slider", min=1, max=10, step=1, value=3, marks={i: str(i) for i in range(1, 11)}, className="mb-3"),
|
218 |
dbc.Button("Convert to PDF", id="submit-button", color="primary", className="mb-3 w-100"),
|
219 |
dbc.Spinner(html.Div(id="output-area"), color="primary", type="grow"),
|
220 |
+
dcc.Interval(id="progress-interval", interval=1000, n_intervals=0, disabled=True),
|
221 |
]),
|
222 |
className="mt-4"
|
223 |
)
|
|
|
225 |
|
226 |
@app.callback(
    Output("output-area", "children"),
    Output("progress-interval", "disabled"),
    Input("submit-button", "n_clicks"),
    Input("progress-interval", "n_intervals"),
    State("url-input", "value"),
    State("depth-slider", "value"),
    prevent_initial_call=True
)
def update_output(n_clicks, n_intervals, url, depth):
    """Start a conversion on submit, then poll for its result.

    The callback fires for either ``submit-button.n_clicks`` or
    ``progress-interval.n_intervals``; which one fired is read from
    ``dash.callback_context.triggered``.

    Args:
        n_clicks: Click counter of the submit button (only used as trigger).
        n_intervals: Tick counter of the polling interval (only used as
            trigger).
        url: Current value of ``url-input``.
        depth: Current value of ``depth-slider`` (not used in this callback).

    Returns:
        A ``(children, interval_disabled)`` pair for ``output-area`` and
        ``progress-interval``:

        * submit with an empty URL -> validation message, polling disabled;
        * submit with a URL -> a ``dcc.Store(id='pdf-store')`` holding
          ``'processing'``, polling enabled;
        * poll with no result yet -> "Processing..." text, polling enabled;
        * poll with a textual payload that is not a PDF -> that text,
          polling disabled;
        * poll with a PDF payload -> download link, polling disabled.

    Raises:
        PreventUpdate: when nothing triggered the callback or the trigger is
            not one of the two handled components.

    NOTE(review): ``pdf-store.data`` is not declared as an ``Input``/``State``
    of this callback, so the context lookup below yields ``None`` and the
    poll keeps answering "Processing..." until the store is wired in (which
    also requires the store to exist in the layout before the first click).
    Each "Processing..." answer also replaces the children that contained
    the store created on submit. Confirm the intended wiring.
    """
    ctx = dash.callback_context
    if not ctx.triggered:
        raise PreventUpdate

    triggered_id = ctx.triggered[0]['prop_id'].split('.')[0]

    if triggered_id == "submit-button":
        if not url:
            return "Please enter a valid URL.", True
        return dcc.Store(id='pdf-store', data='processing'), False

    if triggered_id != "progress-interval":
        raise PreventUpdate

    # The context dictionaries are keyed by "<component id>.<property>", so
    # the bare component id can never match; look the store up by full key.
    store = ctx.states.get('pdf-store.data', ctx.inputs.get('pdf-store.data'))

    if store is None or store == 'processing':
        return "Processing... Please wait.", False

    try:
        if isinstance(store, str):
            # dcc.Store data travels as JSON, so a finished PDF can only
            # arrive as text. Treat it as a PDF only if it is valid base64
            # whose decoded bytes carry the PDF signature; everything else is
            # a status or error message meant for the user.
            try:
                decoded = base64.b64decode(store, validate=True)
            except (ValueError, TypeError):
                decoded = b""
            if not decoded.startswith(b"%PDF"):
                return store, True
            encoded = store
        else:
            # Raw bytes-like payload: encode it for the data URI.
            encoded = base64.b64encode(store).decode()

        download_link = html.A(
            dbc.Button("Download PDF", color="success", className="mt-2"),
            href=f"data:application/pdf;base64,{encoded}",
            download="website_content.pdf"
        )
        return html.Div([
            html.H4("PDF Generated Successfully"),
            download_link
        ]), True
    except Exception as e:
        # UI boundary: report the failure instead of letting the callback die.
        logger.error(f"Error creating download link: {str(e)}")
        return f"An error occurred while creating the download link: {str(e)}", True
|
271 |
+
|
272 |
+
@app.callback(
    Output('pdf-store', 'data'),
    Input('submit-button', 'n_clicks'),
    State('url-input', 'value'),
    State('depth-slider', 'value'),
    prevent_initial_call=True
)
def generate_pdf(n_clicks, url, depth):
    """Crawl ``url`` and place the resulting PDF (or a message) in ``pdf-store``.

    Args:
        n_clicks: Click counter of the submit button (only used as trigger).
        url: Current value of ``url-input``.
        depth: Current value of ``depth-slider``; passed to ``process_url``.

    Returns:
        A string for ``pdf-store.data``:

        * ``"Please enter a valid URL."`` when ``url`` is empty;
        * the message returned by ``process_url`` when it returns text
          (e.g. when no pages were crawled);
        * otherwise the PDF bytes encoded as base64 ASCII text.

    The PDF is base64-encoded because ``dcc.Store`` data is serialised to
    JSON, which cannot carry raw ``bytes``.

    NOTE(review): consumers of ``pdf-store.data`` must treat a PDF payload as
    base64 text (usable directly in a ``data:application/pdf;base64,`` URI)
    rather than calling ``base64.b64encode`` on it again.
    """
    if not url:
        return "Please enter a valid URL."

    def progress_callback(message):
        # process_url requires a progress sink; log the message so progress
        # is observable instead of being written to a dict nobody reads.
        logger.info("%s", message)

    pdf_content = asyncio.run(process_url(url, depth, progress_callback))

    if isinstance(pdf_content, (bytes, bytearray)):
        return base64.b64encode(pdf_content).decode("ascii")

    # Anything else is a textual message from process_url; pass it through.
    return pdf_content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
294 |
|
295 |
if __name__ == '__main__':
|
296 |
print("Starting the Dash application...")
|