Update app.py
app.py CHANGED
@@ -19,17 +19,16 @@ import time
 import os
 import ssl
 from io import BytesIO
+import tempfile
+import uuid
 from concurrent.futures import ThreadPoolExecutor
-import math
-from PyPDF2 import PdfMerger
-

 # Initialize Dash app
 app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
 server = app.server

 # Logging setup
-logging.basicConfig(level=logging.INFO…
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 # Thread-local storage for database connections
@@ -43,6 +42,9 @@ ssl_context = ssl.create_default_context()
 ssl_context.check_hostname = False
 ssl_context.verify_mode = ssl.CERT_NONE

+# ThreadPoolExecutor for background tasks
+executor = ThreadPoolExecutor(max_workers=4)
+
 @contextmanager
 def get_db_connection():
     if not hasattr(thread_local, "connection"):
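The executor created in this hunk is the same one the later hunks hand work to, both through executor.submit(background_task, ...) and through loop.run_in_executor(executor, ...). A minimal, self-contained sketch of the submit pattern; the job function and task id below are placeholders, not part of the commit:

from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=4)

def long_running_job(task_id):
    # Placeholder for blocking work such as PDF generation.
    return f"done: {task_id}"

# submit() returns a Future immediately; the job runs on a worker thread.
future = executor.submit(long_running_job, "task-123")

# result() blocks until the job finishes and re-raises any exception it raised.
print(future.result())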
@@ -112,7 +114,7 @@ async def get_links(session, url, base_url):
         logger.error(f"Error getting links from {url}: {str(e)}")
         return []

-async def crawl_pages(base_url, max_depth, progress_callback):
+async def crawl_pages(base_url, max_depth):
     visited = set()
     to_visit = [(base_url, 0)]
     all_pages = []
@@ -144,9 +146,6 @@ async def crawl_pages(base_url, max_depth, progress_callback):
                 all_pages.append((current_url, content))
                 logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")

-                progress = len(all_pages) / (max_depth * 10) # Rough estimate
-                progress_callback(f"Crawling pages... {progress:.0%}")
-
             if depth < max_depth:
                 links = await get_links(session, current_url, base_url)
                 for link in links:
@@ -158,70 +157,63 @@ async def crawl_pages(base_url, max_depth, progress_callback):

     return all_pages

-def …
+def generate_pdf_chunk(chunk, output_file):
     pdf = FPDF()
     pdf.set_auto_page_break(auto=True, margin=15)
     pdf.add_page()
     pdf.set_font("Arial", size=12)

-    for …
-        pdf.cell(0, 10, txt=…
+    for page_url, content in chunk:
+        pdf.cell(0, 10, txt=page_url, ln=True)
         pdf.ln(5)
         for text in content:
             try:
                 pdf.multi_cell(0, 10, txt=text[:200]) # Limit text length to avoid issues
             except Exception as e:
                 logger.error(f"Error writing text to PDF: {str(e)}")
-    pdf.…
+        if pdf.get_y() > 250: # Add a new page if the current page is almost full
+            pdf.add_page()

-…
+    pdf.output(output_file)

-…
+def website_to_pdf(all_pages, progress_callback):
     logger.info(f"Starting PDF generation for {len(all_pages)} pages")

     chunk_size = 100
-…
-    with …
-…
-    for chunk in pdf_chunks:
-        merger.append(BytesIO(chunk))
-
-    output = BytesIO()
-    merger.write(output)
-    merger.close()
-
-    return output.getvalue()
+    total_chunks = (len(all_pages) + chunk_size - 1) // chunk_size
+    temp_files = []
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        for i in range(0, len(all_pages), chunk_size):
+            chunk = all_pages[i:i+chunk_size]
+            temp_file = os.path.join(temp_dir, f"chunk_{i}.pdf")
+            generate_pdf_chunk(chunk, temp_file)
+            temp_files.append(temp_file)
+
+            progress = min((i + chunk_size) / len(all_pages), 1.0)
+            progress_callback(f"Processing pages... {progress:.0%}")
+
+        # Merge PDF chunks
+        output_pdf = os.path.join(temp_dir, "final.pdf")
+        merger = PdfMerger()
+        for temp_file in temp_files:
+            merger.append(temp_file)
+        merger.write(output_pdf)
+        merger.close()
+
+        with open(output_pdf, 'rb') as f:
+            return f.read()

 async def process_url(url, depth, progress_callback):
     try:
-        all_pages = await …
+        all_pages = await crawl_pages(url, depth)
         if not all_pages:
             return "No pages were successfully crawled. Please check the URL and try again."
-…
+
+        # Use ThreadPoolExecutor to run PDF generation in a separate thread
+        loop = asyncio.get_event_loop()
+        pdf_content = await loop.run_in_executor(executor, website_to_pdf, all_pages, progress_callback)
         return pdf_content
-    except asyncio.TimeoutError:
-        logger.error("Process timed out after 1 hour")
-        return "The process timed out after 1 hour. Please try again with a smaller depth or a more specific URL."
     except Exception as e:
         logger.error(f"Error in process_url: {str(e)}")
         return f"An error occurred: {str(e)}"
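process_url now awaits the blocking PDF build through loop.run_in_executor, so the asyncio loop that drives the crawler is not stalled by the FPDF/PyPDF2 work. A stripped-down sketch of that offloading pattern, with a stand-in blocking function in place of website_to_pdf (the names blocking_work and main are illustrative only):

import asyncio
from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=4)

def blocking_work(n):
    # Stand-in for a CPU- or IO-heavy call such as website_to_pdf(all_pages, cb).
    return sum(range(n))

async def main():
    loop = asyncio.get_running_loop()
    # The await yields control to the event loop while a worker thread runs the job.
    result = await loop.run_in_executor(executor, blocking_work, 1_000_000)
    print(result)

asyncio.run(main())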
@@ -255,7 +247,7 @@ app.layout = dbc.Container([
         dbc.Input(id="url-input", type="text", placeholder="Enter website URL (e.g., https://www.gradio.app/docs)", className="mb-3"),
         dcc.Slider(id="depth-slider", min=1, max=10, step=1, value=3, marks={i: str(i) for i in range(1, 11)}, className="mb-3"),
         dbc.Button("Convert to PDF", id="submit-button", color="primary", className="mb-3 w-100"),
-        dbc.Progress(id="progress-bar", …
+        dbc.Progress(id="progress-bar", style={"visibility": "hidden"}),
         dbc.Spinner(html.Div(id="output-area"), color="primary", type="grow"),
         dcc.Interval(id="progress-interval", interval=1000, n_intervals=0, disabled=True),
     ]),
@@ -266,16 +258,14 @@ app.layout = dbc.Container([
 @app.callback(
     Output("output-area", "children"),
     Output("progress-interval", "disabled"),
-    Output("progress-bar", "…
-    Output("progress-bar", "label"),
+    Output("progress-bar", "style"),
     Input("submit-button", "n_clicks"),
     Input("progress-interval", "n_intervals"),
     State("url-input", "value"),
     State("depth-slider", "value"),
-    State("progress-store", "data"),
     prevent_initial_call=True
 )
-def update_output(n_clicks, n_intervals, url, depth, progress):
+def update_output(n_clicks, n_intervals, url, depth):
     ctx = dash.callback_context
     if not ctx.triggered:
         raise PreventUpdate
@@ -284,22 +274,29 @@ def update_output(n_clicks, n_intervals, url, depth, progress):

     if triggered_id == "submit-button":
         if not url:
-            return "Please enter a valid URL.", True, …
+            return "Please enter a valid URL.", True, {"visibility": "hidden"}

-…
+        # Start the background task
+        task_id = str(uuid.uuid4())
+        executor.submit(background_task, url, depth, task_id)
+
+        return "Processing... Please wait.", False, {"visibility": "visible"}

     elif triggered_id == "progress-interval":
-…
+        # Check progress
+        progress = dash.callback_context.inputs['progress-store.data']
+        if progress is None:
+            return "Processing... Please wait.", False, {"visibility": "visible"}
+
+        if isinstance(progress, str) and progress.startswith("Error"):
+            return progress, True, {"visibility": "hidden"}
+
+        if isinstance(progress, str) and progress.startswith("Processing"):
+            return progress, False, {"visibility": "visible"}
+
+        # PDF generation complete
         try:
-            encoded = base64.b64encode(…
+            encoded = base64.b64encode(progress).decode()
             return html.Div([
                 html.H4("PDF Generated Successfully"),
                 html.A(
@@ -307,36 +304,38 @@ def update_output(n_clicks, n_intervals, url, depth, progress):
                     href=f"data:application/pdf;base64,{encoded}",
                     download="website_content.pdf"
                 )
-            ]), True, …
+            ]), True, {"visibility": "hidden"}
         except Exception as e:
             logger.error(f"Error creating download link: {str(e)}")
-            return f"An error occurred while creating the download link: {str(e)}", True, …
+            return f"An error occurred while creating the download link: {str(e)}", True, {"visibility": "hidden"}

     raise PreventUpdate

 @app.callback(
-    Output('pdf-store', 'data'),
     Output('progress-store', 'data'),
-    Input('…
-    State('url-input', 'value'),
-    State('depth-slider', 'value'),
+    Input('progress-interval', 'n_intervals'),
     prevent_initial_call=True
 )
-def …
-…
+def update_progress(n):
+    # This function will be called every second to update the progress
+    # You can implement a mechanism to check the actual progress of the PDF generation
+    # For now, we'll just return a placeholder message
+    return "Processing... Please wait."

+def background_task(url, depth, task_id):
     def progress_callback(message):
-…
-        pdf_content = asyncio.run(process_url(url, depth, progress_callback))
+        # Update progress in the database or a shared data structure
+        pass

-…
+    try:
+        pdf_content = asyncio.run(process_url(url, depth, progress_callback))
+        # Store the result in a database or shared data structure
+        # For simplicity, we'll use the progress-store, but in a real application,
+        # you should use a more robust solution for storing large data
+        app.layout.children[1].data = pdf_content
+    except Exception as e:
+        logger.error(f"Error in background task: {str(e)}")
+        app.layout.children[1].data = f"Error: {str(e)}"

 if __name__ == '__main__':
     print("Starting the Dash application...")
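background_task leaves progress_callback as a pass and stores the finished PDF by assigning to app.layout.children[1].data; the inline comments themselves note that a real deployment should use a more robust shared store. One possible in-process shape for such a store, purely illustrative (TASKS, report_progress, store_result and read_task are not part of the commit):

import threading

# Hypothetical registry shared between the worker thread and the Dash callbacks:
# task_id -> {"progress": str, "result": bytes}
TASKS = {}
TASKS_LOCK = threading.Lock()

def report_progress(task_id, message):
    # Called from the worker thread, e.g. inside progress_callback.
    with TASKS_LOCK:
        TASKS.setdefault(task_id, {})["progress"] = message

def store_result(task_id, pdf_bytes):
    # Called once the PDF bytes are ready.
    with TASKS_LOCK:
        TASKS.setdefault(task_id, {})["result"] = pdf_bytes

def read_task(task_id):
    # Called from the progress-interval callback to decide what to render.
    with TASKS_LOCK:
        return dict(TASKS.get(task_id, {}))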