Spaces:
Running
Running
Commit
·
8dc2d5d
1
Parent(s):
1402288
Adding time
Browse files
app.py
CHANGED
@@ -12,6 +12,7 @@ from docling_core.types.doc.document import DocTagsDocument
|
|
12 |
from transformers import AutoProcessor, AutoModelForVision2Seq
|
13 |
from transformers.image_utils import load_image
|
14 |
from pathlib import Path
|
|
|
15 |
|
16 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
17 |
|
@@ -36,34 +37,35 @@ def get_pdf_page_count(pdf_path):
|
|
36 |
return len(reader.pages)
|
37 |
|
38 |
def get_page_image(pdf_path, page_num):
|
|
|
39 |
images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
|
40 |
page_image = images[0]
|
41 |
-
|
|
|
42 |
|
43 |
def get_docling_ocr(pdf_path, page_num):
|
|
|
44 |
result = converter.convert(pdf_path, page_range=(page_num, page_num))
|
45 |
markdown_text_docling = result.document.export_to_markdown()
|
46 |
-
|
|
|
47 |
|
48 |
def get_paddle_ocr(pdf_path, page_num):
|
49 |
-
|
50 |
-
|
51 |
output = pipeline.predict(input=np.array(page_image))
|
52 |
-
|
53 |
markdown_list = []
|
54 |
-
|
55 |
for res in output:
|
56 |
md_info = res.markdown
|
57 |
markdown_list.append(md_info)
|
58 |
-
|
59 |
markdown_text_paddleOCR = pipeline.concatenate_markdown_pages(markdown_list)
|
60 |
-
|
|
|
61 |
|
62 |
def get_smoldocling_ocr(pdf_path, page_num):
|
63 |
-
|
|
|
64 |
image = load_image(page_image)
|
65 |
-
|
66 |
-
# Create input messages
|
67 |
messages = [
|
68 |
{
|
69 |
"role": "user",
|
@@ -73,12 +75,9 @@ def get_smoldocling_ocr(pdf_path, page_num):
|
|
73 |
]
|
74 |
},
|
75 |
]
|
76 |
-
|
77 |
-
# Prepare inputs
|
78 |
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
|
79 |
inputs = processor(text=prompt, images=[image], return_tensors="pt")
|
80 |
inputs = inputs.to(DEVICE)
|
81 |
-
|
82 |
generated_ids = model.generate(**inputs, max_new_tokens=8192)
|
83 |
prompt_length = inputs.input_ids.shape[1]
|
84 |
trimmed_generated_ids = generated_ids[:, prompt_length:]
|
@@ -86,13 +85,11 @@ def get_smoldocling_ocr(pdf_path, page_num):
|
|
86 |
trimmed_generated_ids,
|
87 |
skip_special_tokens=False,
|
88 |
)[0].lstrip()
|
89 |
-
|
90 |
-
# Populate document
|
91 |
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
|
92 |
doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
|
93 |
-
|
94 |
markdown_text_smoldocling = doc.export_to_markdown()
|
95 |
-
|
|
|
96 |
|
97 |
title = "OCR Arena"
|
98 |
description = "A simple Gradio interface to extract text from PDFs and compare OCR models"
|
@@ -117,16 +114,20 @@ with gr.Blocks(theme=gr.themes.Glass()) as demo:
|
|
117 |
clear_btn = gr.ClearButton(components=[pdf, page_num])
|
118 |
submit_btn = gr.Button("Submit", variant='primary')
|
119 |
|
120 |
-
submit_btn.click(get_page_image, inputs=[pdf, page_num], outputs=original).then(
|
121 |
-
get_docling_ocr, inputs=[pdf, page_num], outputs=docling_ocr_out).then(
|
122 |
-
get_paddle_ocr, inputs=[pdf, page_num], outputs=paddle_ocr_out).then(
|
123 |
-
get_smoldocling_ocr, inputs=[pdf, page_num], outputs=smoldocling_ocr_out)
|
124 |
|
125 |
with gr.Column():
|
126 |
original = gr.Image(width=640, height=640, label="Original Page", interactive=False)
|
|
|
127 |
docling_ocr_out = gr.Textbox(label="Docling OCR Output", type="text", interactive=False)
|
|
|
128 |
paddle_ocr_out = gr.Textbox(label="Paddle OCR Output", type="text", interactive=False)
|
|
|
129 |
smoldocling_ocr_out = gr.Textbox(label="SmolDocling OCR Output", type="text", interactive=False)
|
|
|
130 |
|
131 |
examples_obj = gr.Examples(examples=examples, inputs=[pdf])
|
132 |
|
|
|
12 |
from transformers import AutoProcessor, AutoModelForVision2Seq
|
13 |
from transformers.image_utils import load_image
|
14 |
from pathlib import Path
|
15 |
+
import time
|
16 |
|
17 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
18 |
|
|
|
37 |
return len(reader.pages)
|
38 |
|
39 |
def get_page_image(pdf_path, page_num):
    """Render a single PDF page to an image and report how long it took.

    Args:
        pdf_path: Path of the PDF, passed straight to ``convert_from_path``.
        page_num: Page to render; used as both ``first_page`` and
            ``last_page`` (presumably 1-based, as in pdf2image -- confirm).

    Returns:
        A ``(page_image, runtime_text)`` tuple where ``runtime_text`` is the
        elapsed time formatted as ``"<seconds with 2 decimals> s"``.

    Raises:
        IndexError: If the conversion yields no image for ``page_num``.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
    # Only one page was requested, so the first entry is the page image.
    page_image = images[0]
    runtime = time.perf_counter() - start
    return page_image, f"{runtime:.2f} s"
|
45 |
|
46 |
def get_docling_ocr(pdf_path, page_num):
    """Convert one PDF page to Markdown with the module-level ``converter``.

    Args:
        pdf_path: Path of the PDF handed to ``converter.convert``.
        page_num: Page to convert; passed as the inclusive
            ``page_range=(page_num, page_num)``.

    Returns:
        A ``(markdown_text, runtime_text)`` tuple where ``runtime_text`` is
        the elapsed time formatted as ``"<seconds with 2 decimals> s"``.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    result = converter.convert(pdf_path, page_range=(page_num, page_num))
    markdown_text_docling = result.document.export_to_markdown()
    runtime = time.perf_counter() - start
    return markdown_text_docling, f"{runtime:.2f} s"
|
52 |
|
53 |
def get_paddle_ocr(pdf_path, page_num):
    """Run the module-level ``pipeline`` on one PDF page and return Markdown.

    The page is first rendered with ``get_page_image``, so the reported time
    includes the PDF-to-image step as well as the prediction.

    Args:
        pdf_path: Path of the PDF to read.
        page_num: Page to process; forwarded to ``get_page_image``.

    Returns:
        A ``(markdown_text, runtime_text)`` tuple where ``runtime_text`` is
        the elapsed time formatted as ``"<seconds with 2 decimals> s"``.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    # get_page_image returns (image, runtime_text); only the image is needed.
    page_image = get_page_image(pdf_path, page_num)[0]
    output = pipeline.predict(input=np.array(page_image))
    # Collect the per-result markdown payloads, then merge them into one text.
    markdown_list = [res.markdown for res in output]
    markdown_text_paddleOCR = pipeline.concatenate_markdown_pages(markdown_list)
    runtime = time.perf_counter() - start
    return markdown_text_paddleOCR, f"{runtime:.2f} s"
|
64 |
|
65 |
def get_smoldocling_ocr(pdf_path, page_num):
|
66 |
+
start = time.time()
|
67 |
+
page_image = get_page_image(pdf_path, page_num)[0]
|
68 |
image = load_image(page_image)
|
|
|
|
|
69 |
messages = [
|
70 |
{
|
71 |
"role": "user",
|
|
|
75 |
]
|
76 |
},
|
77 |
]
|
|
|
|
|
78 |
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
|
79 |
inputs = processor(text=prompt, images=[image], return_tensors="pt")
|
80 |
inputs = inputs.to(DEVICE)
|
|
|
81 |
generated_ids = model.generate(**inputs, max_new_tokens=8192)
|
82 |
prompt_length = inputs.input_ids.shape[1]
|
83 |
trimmed_generated_ids = generated_ids[:, prompt_length:]
|
|
|
85 |
trimmed_generated_ids,
|
86 |
skip_special_tokens=False,
|
87 |
)[0].lstrip()
|
|
|
|
|
88 |
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
|
89 |
doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
|
|
|
90 |
markdown_text_smoldocling = doc.export_to_markdown()
|
91 |
+
runtime = time.time() - start
|
92 |
+
return markdown_text_smoldocling, f"{runtime:.2f} s"
|
93 |
|
94 |
title = "OCR Arena"
|
95 |
description = "A simple Gradio interface to extract text from PDFs and compare OCR models"
|
|
|
114 |
clear_btn = gr.ClearButton(components=[pdf, page_num])
|
115 |
submit_btn = gr.Button("Submit", variant='primary')
|
116 |
|
117 |
+
submit_btn.click(get_page_image, inputs=[pdf, page_num], outputs=[original, original_runtime]).then(
|
118 |
+
get_docling_ocr, inputs=[pdf, page_num], outputs=[docling_ocr_out, docling_ocr_runtime]).then(
|
119 |
+
get_paddle_ocr, inputs=[pdf, page_num], outputs=[paddle_ocr_out, paddle_ocr_runtime]).then(
|
120 |
+
get_smoldocling_ocr, inputs=[pdf, page_num], outputs=[smoldocling_ocr_out, smoldocling_ocr_runtime])
|
121 |
|
122 |
with gr.Column():
|
123 |
original = gr.Image(width=640, height=640, label="Original Page", interactive=False)
|
124 |
+
original_runtime = gr.Textbox(label="Image Extraction Time", type="text", interactive=False)
|
125 |
docling_ocr_out = gr.Textbox(label="Docling OCR Output", type="text", interactive=False)
|
126 |
+
docling_ocr_runtime = gr.Textbox(label="Docling OCR Time", type="text", interactive=False)
|
127 |
paddle_ocr_out = gr.Textbox(label="Paddle OCR Output", type="text", interactive=False)
|
128 |
+
paddle_ocr_runtime = gr.Textbox(label="Paddle OCR Time", type="text", interactive=False)
|
129 |
smoldocling_ocr_out = gr.Textbox(label="SmolDocling OCR Output", type="text", interactive=False)
|
130 |
+
smoldocling_ocr_runtime = gr.Textbox(label="SmolDocling OCR Time", type="text", interactive=False)
|
131 |
|
132 |
examples_obj = gr.Examples(examples=examples, inputs=[pdf])
|
133 |
|