Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -16,34 +16,32 @@ def process_pdf(pdf_file):
|
|
16 |
"""
|
17 |
# Check if a PDF file was uploaded
|
18 |
if pdf_file is None:
|
19 |
-
|
20 |
-
return "<p>Please upload a PDF file.</p>"
|
21 |
|
22 |
# Convert PDF to images
|
23 |
try:
|
|
|
|
|
|
|
24 |
|
25 |
-
|
26 |
-
|
27 |
-
return f"<p>Error converting PDF to images: {str(e)}</p>"
|
28 |
-
|
29 |
-
# Start building the HTML output
|
30 |
-
html = '<div><button onclick="copyAll()" style="margin-bottom: 10px;">Copy All</button></div><div id="pages">'
|
31 |
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
|
40 |
# Extract text from the page using the OCR model
|
41 |
try:
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
|
48 |
# Generate HTML for this page's section
|
49 |
textarea_id = f"text{i+1}"
|
@@ -89,4 +87,5 @@ with gr.Blocks(title="PDF Text Extractor") as demo:
|
|
89 |
submit_btn.click(fn=process_pdf, inputs=pdf_input, outputs=output_html)
|
90 |
|
91 |
# Launch the interface
|
92 |
-
demo.launch()
|
|
|
|
16 |
"""
|
17 |
# Check if a PDF file was uploaded
|
18 |
if pdf_file is None:
|
19 |
+
return "<p>Please upload a PDF file.</p>"
|
|
|
20 |
|
21 |
# Convert PDF to images
|
22 |
try:
|
23 |
+
pages = convert_from_path(pdf_file.name)
|
24 |
+
except Exception as e:
|
25 |
+
return f"<p>Error converting PDF to images: {str(e)}</p>"
|
26 |
|
27 |
+
# Start building the HTML output
|
28 |
+
html = '<div><button onclick="copyAll()" style="margin-bottom: 10px;">Copy All</button></div><div id="pages">'
|
|
|
|
|
|
|
|
|
29 |
|
30 |
+
# Process each page
|
31 |
+
for i, page in enumerate(pages):
|
32 |
+
# Convert the page image to base64 for embedding in HTML
|
33 |
+
buffered = io.BytesIO()
|
34 |
+
page.save(buffered, format="PNG")
|
35 |
+
img_str = base64.b64encode(buffered.getvalue()).decode()
|
36 |
+
img_data = f"data:image/png;base64,{img_str}"
|
37 |
|
38 |
# Extract text from the page using the OCR model
|
39 |
try:
|
40 |
+
inputs = processor(text="Extract the text from this image.", images=page, return_tensors="pt")
|
41 |
+
outputs = model.generate(**inputs)
|
42 |
+
text = processor.decode(outputs[0], skip_special_tokens=True)
|
43 |
+
except Exception as e:
|
44 |
+
text = f"Error extracting text: {str(e)}"
|
45 |
|
46 |
# Generate HTML for this page's section
|
47 |
textarea_id = f"text{i+1}"
|
|
|
87 |
submit_btn.click(fn=process_pdf, inputs=pdf_input, outputs=output_html)
|
88 |
|
89 |
# Launch the interface
|
90 |
+
demo.launch()
|
91 |
+
──────────────────────────────────────────────────
|