Spaces:
Running
Running
Commit
·
1402288
1
Parent(s):
f449995
Execution changes
Browse files
app.py
CHANGED
@@ -35,12 +35,19 @@ def get_pdf_page_count(pdf_path):
|
|
35 |
reader = PdfReader(pdf_path)
|
36 |
return len(reader.pages)
|
37 |
|
|
|
|
|
|
|
|
|
|
|
38 |
def get_docling_ocr(pdf_path, page_num):
|
39 |
result = converter.convert(pdf_path, page_range=(page_num, page_num))
|
40 |
markdown_text_docling = result.document.export_to_markdown()
|
41 |
return markdown_text_docling
|
42 |
|
43 |
-
def get_paddle_ocr(
|
|
|
|
|
44 |
output = pipeline.predict(input=np.array(page_image))
|
45 |
|
46 |
markdown_list = []
|
@@ -52,7 +59,8 @@ def get_paddle_ocr(page_image):
|
|
52 |
markdown_text_paddleOCR = pipeline.concatenate_markdown_pages(markdown_list)
|
53 |
return markdown_text_paddleOCR
|
54 |
|
55 |
-
def get_smoldocling_ocr(
|
|
|
56 |
image = load_image(page_image)
|
57 |
|
58 |
# Create input messages
|
@@ -85,16 +93,6 @@ def get_smoldocling_ocr(page_image):
|
|
85 |
|
86 |
markdown_text_smoldocling = doc.export_to_markdown()
|
87 |
return markdown_text_smoldocling
|
88 |
-
|
89 |
-
|
90 |
-
def inference(pdf_path, page_num):
|
91 |
-
docling_ocr = get_docling_ocr(pdf_path, page_num)
|
92 |
-
# Extract the first page as an image
|
93 |
-
images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
|
94 |
-
page_image = images[0]
|
95 |
-
paddle_ocr = get_paddle_ocr(page_image)
|
96 |
-
smoldocling_ocr = get_smoldocling_ocr(page_image)
|
97 |
-
return page_image, docling_ocr, paddle_ocr, smoldocling_ocr
|
98 |
|
99 |
title = "OCR Arena"
|
100 |
description = "A simple Gradio interface to extract text from PDFs and compare OCR models"
|
@@ -102,6 +100,7 @@ examples = [["data/amazon-10-k-2024.pdf"],
|
|
102 |
["data/goog-10-k-2023.pdf"]]
|
103 |
|
104 |
with gr.Blocks(theme=gr.themes.Glass()) as demo:
|
|
|
105 |
with gr.Row():
|
106 |
with gr.Column():
|
107 |
pdf = gr.File(label="Input PDFs", file_types=[".pdf"])
|
@@ -118,7 +117,10 @@ with gr.Blocks(theme=gr.themes.Glass()) as demo:
|
|
118 |
clear_btn = gr.ClearButton(components=[pdf, page_num])
|
119 |
submit_btn = gr.Button("Submit", variant='primary')
|
120 |
|
121 |
-
submit_btn.click(
|
|
|
|
|
|
|
122 |
|
123 |
with gr.Column():
|
124 |
original = gr.Image(width=640, height=640, label="Original Page", interactive=False)
|
|
|
35 |
reader = PdfReader(pdf_path)
|
36 |
return len(reader.pages)
|
37 |
|
38 |
+
def get_page_image(pdf_path, page_num):
|
39 |
+
images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
|
40 |
+
page_image = images[0]
|
41 |
+
return page_image
|
42 |
+
|
43 |
def get_docling_ocr(pdf_path, page_num):
|
44 |
result = converter.convert(pdf_path, page_range=(page_num, page_num))
|
45 |
markdown_text_docling = result.document.export_to_markdown()
|
46 |
return markdown_text_docling
|
47 |
|
48 |
+
def get_paddle_ocr(pdf_path, page_num):
|
49 |
+
page_image = get_page_image(pdf_path, page_num)
|
50 |
+
|
51 |
output = pipeline.predict(input=np.array(page_image))
|
52 |
|
53 |
markdown_list = []
|
|
|
59 |
markdown_text_paddleOCR = pipeline.concatenate_markdown_pages(markdown_list)
|
60 |
return markdown_text_paddleOCR
|
61 |
|
62 |
+
def get_smoldocling_ocr(pdf_path, page_num):
|
63 |
+
page_image = get_page_image(pdf_path, page_num)
|
64 |
image = load_image(page_image)
|
65 |
|
66 |
# Create input messages
|
|
|
93 |
|
94 |
markdown_text_smoldocling = doc.export_to_markdown()
|
95 |
return markdown_text_smoldocling
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
title = "OCR Arena"
|
98 |
description = "A simple Gradio interface to extract text from PDFs and compare OCR models"
|
|
|
100 |
["data/goog-10-k-2023.pdf"]]
|
101 |
|
102 |
with gr.Blocks(theme=gr.themes.Glass()) as demo:
|
103 |
+
gr.Markdown(f"# {title}\n{description}")
|
104 |
with gr.Row():
|
105 |
with gr.Column():
|
106 |
pdf = gr.File(label="Input PDFs", file_types=[".pdf"])
|
|
|
117 |
clear_btn = gr.ClearButton(components=[pdf, page_num])
|
118 |
submit_btn = gr.Button("Submit", variant='primary')
|
119 |
|
120 |
+
submit_btn.click(get_page_image, inputs=[pdf, page_num], outputs=original).then(
|
121 |
+
get_docling_ocr, inputs=[pdf, page_num], outputs=docling_ocr_out).then(
|
122 |
+
get_paddle_ocr, inputs=[pdf, page_num], outputs=paddle_ocr_out).then(
|
123 |
+
get_smoldocling_ocr, inputs=[pdf, page_num], outputs=smoldocling_ocr_out)
|
124 |
|
125 |
with gr.Column():
|
126 |
original = gr.Image(width=640, height=640, label="Original Page", interactive=False)
|