Spaces:

jkorstad
/

PDF-Parser

Runtime error

App Files Community

jkorstad committed on Feb 26

Commit

73efe67

verified ·

1 Parent(s): 455b7c3

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -11

app.py CHANGED Viewed

@@ -22,26 +22,30 @@ except ValueError as e:
 @spaces.GPU
 def process_pdf(pdf_file):
     """
-    Process the uploaded PDF file, extract text from each page, and generate HTML
-    to display each page's image and text with copy buttons.
     """
     if processor is None or model is None:
-        return "<p>Error: Model could not be loaded. Check environment setup (PyTorch may be missing) or model compatibility.</p>"
     # Check if a PDF file was uploaded
     if pdf_file is None:
-        return "<p>Please upload a PDF file.</p>"
     # Convert PDF to images
     try:
         pages = convert_from_path(pdf_file.name)
     except Exception as e:
-        return f"<p>Error converting PDF to images: {str(e)}</p>"
-    # Start building the HTML output
     html = '<div><button onclick="copyAll()" style="margin-bottom: 10px;">Copy All</button></div><div id="pages">'
-    # Process each page
     for i, page in enumerate(pages):
         # Convert the page image to base64 for embedding in HTML
         buffered = io.BytesIO()
@@ -59,7 +63,7 @@ def process_pdf(pdf_file):
         # Generate HTML for this page's section
         textarea_id = f"text{i+1}"
-        html += f'''
         <div class="page" style="margin-bottom: 20px; border-bottom: 1px solid #ccc; padding-bottom: 20px;">
             <h3>Page {i+1}</h3>
             <div style="display: flex; align-items: flex-start;">
@@ -71,8 +75,12 @@ def process_pdf(pdf_file):
             </div>
         </div>
         '''
-    # Close the pages div and add JavaScript for copy functionality
     html += '</div>'
     html += '''
     <script>
@@ -88,12 +96,12 @@ def process_pdf(pdf_file):
     }
     </script>
     '''
-    return html
 # Define the Gradio interface
 with gr.Blocks(title="PDF Text Extractor") as demo:
     gr.Markdown("# PDF Text Extractor")
-    gr.Markdown("Upload a PDF file and click 'Extract Text' to see each page's image and extracted text.")
     with gr.Row():
         pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
         submit_btn = gr.Button("Extract Text")

 @spaces.GPU
 def process_pdf(pdf_file):
     """
+    Process the uploaded PDF file one page at a time, yielding HTML for each page
+    with its image and extracted text.
     """
     if processor is None or model is None:
+        yield "<p>Error: Model could not be loaded. Check environment setup (PyTorch may be missing) or model compatibility.</p>"
+        return
     # Check if a PDF file was uploaded
     if pdf_file is None:
+        yield "<p>Please upload a PDF file.</p>"
+        return
     # Convert PDF to images
     try:
         pages = convert_from_path(pdf_file.name)
     except Exception as e:
+        yield f"<p>Error converting PDF to images: {str(e)}</p>"
+        return
+    # Initial HTML with "Copy All" button and container for pages
     html = '<div><button onclick="copyAll()" style="margin-bottom: 10px;">Copy All</button></div><div id="pages">'
+    yield html  # Start with the header
+    # Process each page incrementally
     for i, page in enumerate(pages):
         # Convert the page image to base64 for embedding in HTML
         buffered = io.BytesIO()
         # Generate HTML for this page's section
         textarea_id = f"text{i+1}"
+        page_html = f'''
         <div class="page" style="margin-bottom: 20px; border-bottom: 1px solid #ccc; padding-bottom: 20px;">
             <h3>Page {i+1}</h3>
             <div style="display: flex; align-items: flex-start;">
             </div>
         </div>
         '''
+        # Append this page to the existing HTML and yield the updated content
+        html += page_html
+        yield html
+    # After all pages are processed, close the div and add JavaScript
     html += '</div>'
     html += '''
     <script>
     }
     </script>
     '''
+    yield html  # Final yield with complete content and scripts
 # Define the Gradio interface
 with gr.Blocks(title="PDF Text Extractor") as demo:
     gr.Markdown("# PDF Text Extractor")
+    gr.Markdown("Upload a PDF file and click 'Extract Text' to see each page's image and extracted text incrementally.")
     with gr.Row():
         pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
         submit_btn = gr.Button("Extract Text")