jkorstad committed on
Commit
73efe67
·
verified ·
1 Parent(s): 455b7c3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -11
app.py CHANGED
@@ -22,26 +22,30 @@ except ValueError as e:
22
  @spaces.GPU
23
  def process_pdf(pdf_file):
24
  """
25
- Process the uploaded PDF file, extract text from each page, and generate HTML
26
- to display each page's image and text with copy buttons.
27
  """
28
  if processor is None or model is None:
29
- return "<p>Error: Model could not be loaded. Check environment setup (PyTorch may be missing) or model compatibility.</p>"
 
30
 
31
  # Check if a PDF file was uploaded
32
  if pdf_file is None:
33
- return "<p>Please upload a PDF file.</p>"
 
34
 
35
  # Convert PDF to images
36
  try:
37
  pages = convert_from_path(pdf_file.name)
38
  except Exception as e:
39
- return f"<p>Error converting PDF to images: {str(e)}</p>"
 
40
 
41
- # Start building the HTML output
42
  html = '<div><button onclick="copyAll()" style="margin-bottom: 10px;">Copy All</button></div><div id="pages">'
 
43
 
44
- # Process each page
45
  for i, page in enumerate(pages):
46
  # Convert the page image to base64 for embedding in HTML
47
  buffered = io.BytesIO()
@@ -59,7 +63,7 @@ def process_pdf(pdf_file):
59
 
60
  # Generate HTML for this page's section
61
  textarea_id = f"text{i+1}"
62
- html += f'''
63
  <div class="page" style="margin-bottom: 20px; border-bottom: 1px solid #ccc; padding-bottom: 20px;">
64
  <h3>Page {i+1}</h3>
65
  <div style="display: flex; align-items: flex-start;">
@@ -71,8 +75,12 @@ def process_pdf(pdf_file):
71
  </div>
72
  </div>
73
  '''
 
 
 
 
74
 
75
- # Close the pages div and add JavaScript for copy functionality
76
  html += '</div>'
77
  html += '''
78
  <script>
@@ -88,12 +96,12 @@ def process_pdf(pdf_file):
88
  }
89
  </script>
90
  '''
91
- return html
92
 
93
  # Define the Gradio interface
94
  with gr.Blocks(title="PDF Text Extractor") as demo:
95
  gr.Markdown("# PDF Text Extractor")
96
- gr.Markdown("Upload a PDF file and click 'Extract Text' to see each page's image and extracted text.")
97
  with gr.Row():
98
  pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
99
  submit_btn = gr.Button("Extract Text")
 
22
  @spaces.GPU
23
  def process_pdf(pdf_file):
24
  """
25
+ Process the uploaded PDF file one page at a time, yielding HTML for each page
26
+ with its image and extracted text.
27
  """
28
  if processor is None or model is None:
29
+ yield "<p>Error: Model could not be loaded. Check environment setup (PyTorch may be missing) or model compatibility.</p>"
30
+ return
31
 
32
  # Check if a PDF file was uploaded
33
  if pdf_file is None:
34
+ yield "<p>Please upload a PDF file.</p>"
35
+ return
36
 
37
  # Convert PDF to images
38
  try:
39
  pages = convert_from_path(pdf_file.name)
40
  except Exception as e:
41
+ yield f"<p>Error converting PDF to images: {str(e)}</p>"
42
+ return
43
 
44
+ # Initial HTML with "Copy All" button and container for pages
45
  html = '<div><button onclick="copyAll()" style="margin-bottom: 10px;">Copy All</button></div><div id="pages">'
46
+ yield html # Start with the header
47
 
48
+ # Process each page incrementally
49
  for i, page in enumerate(pages):
50
  # Convert the page image to base64 for embedding in HTML
51
  buffered = io.BytesIO()
 
63
 
64
  # Generate HTML for this page's section
65
  textarea_id = f"text{i+1}"
66
+ page_html = f'''
67
  <div class="page" style="margin-bottom: 20px; border-bottom: 1px solid #ccc; padding-bottom: 20px;">
68
  <h3>Page {i+1}</h3>
69
  <div style="display: flex; align-items: flex-start;">
 
75
  </div>
76
  </div>
77
  '''
78
+
79
+ # Append this page to the existing HTML and yield the updated content
80
+ html += page_html
81
+ yield html
82
 
83
+ # After all pages are processed, close the div and add JavaScript
84
  html += '</div>'
85
  html += '''
86
  <script>
 
96
  }
97
  </script>
98
  '''
99
+ yield html # Final yield with complete content and scripts
100
 
101
  # Define the Gradio interface
102
  with gr.Blocks(title="PDF Text Extractor") as demo:
103
  gr.Markdown("# PDF Text Extractor")
104
+ gr.Markdown("Upload a PDF file and click 'Extract Text' to see each page's image and extracted text incrementally.")
105
  with gr.Row():
106
  pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
107
  submit_btn = gr.Button("Extract Text")