Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -22,26 +22,30 @@ except ValueError as e:
|
|
22 |
@spaces.GPU
|
23 |
def process_pdf(pdf_file):
|
24 |
"""
|
25 |
-
Process the uploaded PDF file
|
26 |
-
|
27 |
"""
|
28 |
if processor is None or model is None:
|
29 |
-
|
|
|
30 |
|
31 |
# Check if a PDF file was uploaded
|
32 |
if pdf_file is None:
|
33 |
-
|
|
|
34 |
|
35 |
# Convert PDF to images
|
36 |
try:
|
37 |
pages = convert_from_path(pdf_file.name)
|
38 |
except Exception as e:
|
39 |
-
|
|
|
40 |
|
41 |
-
#
|
42 |
html = '<div><button onclick="copyAll()" style="margin-bottom: 10px;">Copy All</button></div><div id="pages">'
|
|
|
43 |
|
44 |
-
# Process each page
|
45 |
for i, page in enumerate(pages):
|
46 |
# Convert the page image to base64 for embedding in HTML
|
47 |
buffered = io.BytesIO()
|
@@ -59,7 +63,7 @@ def process_pdf(pdf_file):
|
|
59 |
|
60 |
# Generate HTML for this page's section
|
61 |
textarea_id = f"text{i+1}"
|
62 |
-
|
63 |
<div class="page" style="margin-bottom: 20px; border-bottom: 1px solid #ccc; padding-bottom: 20px;">
|
64 |
<h3>Page {i+1}</h3>
|
65 |
<div style="display: flex; align-items: flex-start;">
|
@@ -71,8 +75,12 @@ def process_pdf(pdf_file):
|
|
71 |
</div>
|
72 |
</div>
|
73 |
'''
|
|
|
|
|
|
|
|
|
74 |
|
75 |
-
#
|
76 |
html += '</div>'
|
77 |
html += '''
|
78 |
<script>
|
@@ -88,12 +96,12 @@ def process_pdf(pdf_file):
|
|
88 |
}
|
89 |
</script>
|
90 |
'''
|
91 |
-
|
92 |
|
93 |
# Define the Gradio interface
|
94 |
with gr.Blocks(title="PDF Text Extractor") as demo:
|
95 |
gr.Markdown("# PDF Text Extractor")
|
96 |
-
gr.Markdown("Upload a PDF file and click 'Extract Text' to see each page's image and extracted text.")
|
97 |
with gr.Row():
|
98 |
pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
|
99 |
submit_btn = gr.Button("Extract Text")
|
|
|
22 |
@spaces.GPU
|
23 |
def process_pdf(pdf_file):
|
24 |
"""
|
25 |
+
Process the uploaded PDF file one page at a time, yielding HTML for each page
|
26 |
+
with its image and extracted text.
|
27 |
"""
|
28 |
if processor is None or model is None:
|
29 |
+
yield "<p>Error: Model could not be loaded. Check environment setup (PyTorch may be missing) or model compatibility.</p>"
|
30 |
+
return
|
31 |
|
32 |
# Check if a PDF file was uploaded
|
33 |
if pdf_file is None:
|
34 |
+
yield "<p>Please upload a PDF file.</p>"
|
35 |
+
return
|
36 |
|
37 |
# Convert PDF to images
|
38 |
try:
|
39 |
pages = convert_from_path(pdf_file.name)
|
40 |
except Exception as e:
|
41 |
+
yield f"<p>Error converting PDF to images: {str(e)}</p>"
|
42 |
+
return
|
43 |
|
44 |
+
# Initial HTML with "Copy All" button and container for pages
|
45 |
html = '<div><button onclick="copyAll()" style="margin-bottom: 10px;">Copy All</button></div><div id="pages">'
|
46 |
+
yield html # Start with the header
|
47 |
|
48 |
+
# Process each page incrementally
|
49 |
for i, page in enumerate(pages):
|
50 |
# Convert the page image to base64 for embedding in HTML
|
51 |
buffered = io.BytesIO()
|
|
|
63 |
|
64 |
# Generate HTML for this page's section
|
65 |
textarea_id = f"text{i+1}"
|
66 |
+
page_html = f'''
|
67 |
<div class="page" style="margin-bottom: 20px; border-bottom: 1px solid #ccc; padding-bottom: 20px;">
|
68 |
<h3>Page {i+1}</h3>
|
69 |
<div style="display: flex; align-items: flex-start;">
|
|
|
75 |
</div>
|
76 |
</div>
|
77 |
'''
|
78 |
+
|
79 |
+
# Append this page to the existing HTML and yield the updated content
|
80 |
+
html += page_html
|
81 |
+
yield html
|
82 |
|
83 |
+
# After all pages are processed, close the div and add JavaScript
|
84 |
html += '</div>'
|
85 |
html += '''
|
86 |
<script>
|
|
|
96 |
}
|
97 |
</script>
|
98 |
'''
|
99 |
+
yield html # Final yield with complete content and scripts
|
100 |
|
101 |
# Define the Gradio interface
|
102 |
with gr.Blocks(title="PDF Text Extractor") as demo:
|
103 |
gr.Markdown("# PDF Text Extractor")
|
104 |
+
gr.Markdown("Upload a PDF file and click 'Extract Text' to see each page's image and extracted text incrementally.")
|
105 |
with gr.Row():
|
106 |
pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
|
107 |
submit_btn = gr.Button("Extract Text")
|