Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -195,118 +195,4 @@ def website_to_pdf(all_pages, progress_callback):
|
|
195 |
progress_callback(f"Processing pages... {progress:.0%}")
|
196 |
logger.info(f"Generated PDF chunk {i//chunk_size + 1}/{total_chunks}")
|
197 |
|
198 |
-
logger.info("
|
199 |
-
output_pdf = os.path.join(temp_dir, "final.pdf")
|
200 |
-
merger = PdfMerger()
|
201 |
-
for temp_file in temp_files:
|
202 |
-
merger.append(temp_file)
|
203 |
-
merger.write(output_pdf)
|
204 |
-
merger.close()
|
205 |
-
|
206 |
-
logger.info("PDF generation complete. Reading final PDF...")
|
207 |
-
with open(output_pdf, 'rb') as f:
|
208 |
-
return f.read()
|
209 |
-
|
210 |
-
async def process_url(url, depth, progress_callback):
    """Crawl ``url`` and turn the crawled pages into one PDF.

    Args:
        url: Start URL passed to ``crawl_pages``.
        depth: Crawl depth passed to ``crawl_pages``.
        progress_callback: Callable forwarded unchanged to ``website_to_pdf``.

    Returns:
        The value returned by ``website_to_pdf`` on success. Failures are
        reported by returning a ``str`` instead of raising: the "no pages"
        message when the crawl yields nothing, or ``"An error occurred: ..."``
        when any exception is caught.
    """
    try:
        all_pages = await crawl_pages(url, depth)
        if not all_pages:
            return "No pages were successfully crawled. Please check the URL and try again."

        logger.info("Crawling complete. Starting PDF generation...")
        # Run PDF generation on the executor so the event loop is not blocked.
        # We are inside a coroutine, so the running loop is the one to use.
        # NOTE(review): background_task is itself submitted to this same
        # executor; confirm the pool has enough workers for the nested submit.
        loop = asyncio.get_running_loop()
        pdf_content = await loop.run_in_executor(
            executor, website_to_pdf, all_pages, progress_callback
        )
        logger.info("PDF generation complete.")
        return pdf_content
    except Exception as e:
        # Boundary handler: callers expect a message string, never an exception.
        # logger.exception keeps the traceback that logger.error dropped.
        logger.exception(f"Error in process_url: {e}")
        return f"An error occurred: {str(e)}"
|
225 |
-
|
226 |
-
# App layout
|
227 |
-
app.layout = dbc.Container(
    [
        # Keep the two stores first and in this order: background_task
        # addresses the progress store by position (app.layout.children[1]).
        dcc.Store(id='pdf-store'),
        dcc.Store(id='progress-store'),
        dbc.Card(
            dbc.CardBody(
                [
                    html.H1(
                        "Website to PDF Converter",
                        className="text-center mb-4",
                    ),
                    html.P(
                        "Enter docs URL and crawl depth to convert documentation pages into a PDF. Be responsible for sites you have permission to do this",
                        className="text-center mb-4",
                    ),
                    # User inputs: start URL and crawl depth (1-10, default 3).
                    dbc.Input(
                        id="url-input",
                        type="text",
                        placeholder="Enter website URL (e.g., https://www.gradio.app/docs)",
                        className="mb-3",
                    ),
                    dcc.Slider(
                        id="depth-slider",
                        min=1,
                        max=10,
                        step=1,
                        value=3,
                        marks={level: str(level) for level in range(1, 11)},
                        className="mb-3",
                    ),
                    # Actions: the download button starts out disabled.
                    dbc.Button(
                        "Convert to PDF",
                        id="submit-button",
                        color="primary",
                        className="mb-3 w-100",
                    ),
                    dbc.Button(
                        "Download PDF",
                        id="download-button",
                        color="secondary",
                        className="mb-3 w-100",
                        disabled=True,
                    ),
                    # Progress text wrapped in a spinner.
                    html.Div(
                        [
                            dbc.Spinner(
                                html.Div(id="progress-message"),
                                color="primary",
                                type="grow",
                                size="lg",
                            ),
                        ],
                        className="text-center mb-3",
                    ),
                    # One-second ticker, disabled until something enables it.
                    dcc.Interval(
                        id="progress-interval",
                        interval=1000,
                        n_intervals=0,
                        disabled=True,
                    ),
                    dcc.Download(id="download-pdf"),
                ]
            ),
            className="mt-4",
        ),
    ],
    fluid=True,
)
|
247 |
-
|
248 |
-
def update_output(n_clicks, n_intervals, progress_data, url, depth):
    """Start a conversion or report its progress, depending on the trigger.

    Returns a 5-tuple. Judging by the values, it presumably maps to
    (download-button disabled, download-button color, progress-interval
    disabled, pdf-store data, progress message).
    NOTE(review): no ``@app.callback`` registration for this function is
    visible here; confirm the output wiring against the real decorator.

    Raises:
        PreventUpdate: when nothing triggered the call.
    """
    ctx = dash.callback_context
    if not ctx.triggered:
        raise PreventUpdate

    # prop_id looks like "<component-id>.<property>"; only the id is needed.
    source_id = ctx.triggered[0]['prop_id'].split('.')[0]
    still_working = (True, "secondary", False, None, "Processing... Please wait.")

    if source_id == "submit-button":
        if not url:
            return True, "secondary", True, None, "Please enter a URL"

        # Hand the crawl/convert job to the executor and return immediately.
        task_id = str(uuid.uuid4())
        executor.submit(background_task, url, depth, task_id)
        return still_working

    if source_id in ("progress-interval", "progress-store"):
        if progress_data is None:
            return still_working

        if isinstance(progress_data, str):
            # Strings are status text; an "Error" prefix marks a terminal
            # failure and flips the third value to True.
            failed = progress_data.startswith("Error")
            return True, "secondary", failed, None, progress_data

        if isinstance(progress_data, bytes):
            # Raw PDF bytes: hand them on base64-encoded as text.
            encoded = base64.b64encode(progress_data).decode()
            return False, "primary", True, encoded, "PDF ready for download!"

    # Any other trigger, or progress data of an unexpected type.
    return True, "secondary", False, None, ""
|
280 |
-
|
281 |
-
@app.callback(
    Output("download-pdf", "data"),
    Input("download-button", "n_clicks"),
    State("pdf-store", "data"),
    prevent_initial_call=True,
)
def download_pdf(n_clicks, pdf_data):
    """Send the stored PDF to the browser when the download button is clicked.

    Args:
        n_clicks: Click count of the download button (only used as trigger).
        pdf_data: Base64 text read from ``pdf-store``, or ``None``.

    Returns:
        The ``dcc.send_bytes`` payload for the ``download-pdf`` component.

    Raises:
        PreventUpdate: when the store holds no PDF.
    """
    if pdf_data is None:
        raise PreventUpdate

    pdf_bytes = base64.b64decode(pdf_data)
    # Unix timestamp (whole seconds) goes into the file name.
    stamp = int(time.time())
    return dcc.send_bytes(pdf_bytes, f"website_content_{stamp}.pdf")
|
293 |
-
|
294 |
-
def background_task(url, depth, task_id):
    """Run the crawl-and-convert pipeline for one request and publish the result.

    Progress messages, the final PDF content, or an ``"Error: ..."`` message
    are written to the ``data`` attribute of the second layout child (the
    ``progress-store`` component).

    Args:
        url: Start URL passed to ``process_url``.
        depth: Crawl depth passed to ``process_url``.
        task_id: Identifier generated by the caller; not used here.
    """
    def publish(value):
        # NOTE(review): this mutates the server-side layout object, which is
        # shared by every session; confirm that browsers actually observe the
        # change and that non-text values survive serialization.
        app.layout.children[1].data = value

    try:
        logger.info(f"Starting background task for URL: {url}, depth: {depth}")
        result = asyncio.run(process_url(url, depth, publish))
        if isinstance(result, str):
            # process_url reports failure by returning a message instead of
            # raising. Publish it with the "Error" prefix that update_output
            # checks for, so the failure is terminal rather than being shown
            # as ordinary progress text, and do not log it as a success.
            logger.error(f"Background task failed: {result}")
            publish(f"Error: {result}")
            return
        logger.info("Background task completed successfully")
        # Store the PDF content directly in the progress-store
        publish(result)
    except Exception as e:
        # Top-level boundary of the worker: record the traceback and surface
        # the failure to the UI instead of letting the thread die silently.
        logger.exception(f"Error in background task: {e}")
        publish(f"Error: {str(e)}")
|
308 |
-
|
309 |
-
if __name__ == '__main__':
    print("Starting the Dash application...")
    # The server listens on every interface (0.0.0.0), so the interactive
    # debugger must not be on by default. Opt in for local development by
    # setting DASH_DEBUG=1 (or "true"/"yes").
    debug_enabled = os.environ.get("DASH_DEBUG", "").strip().lower() in ("1", "true", "yes")
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)
    print("Dash application has finished running.")
|
|
|
195 |
progress_callback(f"Processing pages... {progress:.0%}")
|
196 |
logger.info(f"Generated PDF chunk {i//chunk_size + 1}/{total_chunks}")
|
197 |
|
198 |
+
logger.info("Mer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|