Spaces:
Running
Running
File size: 5,072 Bytes
fffbbeb ae5ac9c 03a594f be84858 ae5ac9c fffbbeb be84858 fffbbeb 1402288 ae5ac9c 1402288 03a594f 1402288 be84858 fffbbeb 1402288 843d8b6 adf4200 843d8b6 1402288 843d8b6 f449995 3cd5721 fffbbeb d4225f7 2c07938 fffbbeb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
from PyPDF2 import PdfReader
import gradio as gr
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
from paddleocr import PPStructureV3
from pdf2image import convert_from_path
import numpy as np
import torch
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image
from pathlib import Path
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
pipeline_options = PdfPipelineOptions(enable_remote_services=True)
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
pipeline = PPStructureV3()
processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
model = AutoModelForVision2Seq.from_pretrained(
"ds4sd/SmolDocling-256M-preview",
torch_dtype=torch.bfloat16,
_attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
).to(DEVICE)
def get_pdf_page_count(pdf_path):
reader = PdfReader(pdf_path)
return len(reader.pages)
def get_page_image(pdf_path, page_num):
images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
page_image = images[0]
return page_image
def get_docling_ocr(pdf_path, page_num):
result = converter.convert(pdf_path, page_range=(page_num, page_num))
markdown_text_docling = result.document.export_to_markdown()
return markdown_text_docling
def get_paddle_ocr(pdf_path, page_num):
page_image = get_page_image(pdf_path, page_num)
output = pipeline.predict(input=np.array(page_image))
markdown_list = []
for res in output:
md_info = res.markdown
markdown_list.append(md_info)
markdown_text_paddleOCR = pipeline.concatenate_markdown_pages(markdown_list)
return markdown_text_paddleOCR
def get_smoldocling_ocr(pdf_path, page_num):
page_image = get_page_image(pdf_path, page_num)
image = load_image(page_image)
# Create input messages
messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "Convert this page to docling."}
]
},
]
# Prepare inputs
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt")
inputs = inputs.to(DEVICE)
generated_ids = model.generate(**inputs, max_new_tokens=8192)
prompt_length = inputs.input_ids.shape[1]
trimmed_generated_ids = generated_ids[:, prompt_length:]
doctags = processor.batch_decode(
trimmed_generated_ids,
skip_special_tokens=False,
)[0].lstrip()
# Populate document
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
markdown_text_smoldocling = doc.export_to_markdown()
return markdown_text_smoldocling
title = "OCR Arena"
description = "A simple Gradio interface to extract text from PDFs and compare OCR models"
examples = [["data/amazon-10-k-2024.pdf"],
["data/goog-10-k-2023.pdf"]]
with gr.Blocks(theme=gr.themes.Glass()) as demo:
gr.Markdown(f"# {title}\n{description}")
with gr.Row():
with gr.Column():
pdf = gr.File(label="Input PDFs", file_types=[".pdf"])
@gr.render(inputs=pdf)
def show_slider(pdf_path):
if pdf_path is None:
page_num = gr.Markdown("## No Input Provided")
else:
page_count = get_pdf_page_count(pdf_path)
page_num = gr.Slider(1, page_count, value=1, step=1, label="Page Number")
with gr.Row():
clear_btn = gr.ClearButton(components=[pdf, page_num])
submit_btn = gr.Button("Submit", variant='primary')
submit_btn.click(get_page_image, inputs=[pdf, page_num], outputs=original).then(
get_docling_ocr, inputs=[pdf, page_num], outputs=docling_ocr_out).then(
get_paddle_ocr, inputs=[pdf, page_num], outputs=paddle_ocr_out).then(
get_smoldocling_ocr, inputs=[pdf, page_num], outputs=smoldocling_ocr_out)
with gr.Column():
original = gr.Image(width=640, height=640, label="Original Page", interactive=False)
docling_ocr_out = gr.Textbox(label="Docling OCR Output", type="text", interactive=False)
paddle_ocr_out = gr.Textbox(label="Paddle OCR Output", type="text", interactive=False)
smoldocling_ocr_out = gr.Textbox(label="SmolDocling OCR Output", type="text", interactive=False)
examples_obj = gr.Examples(examples=examples, inputs=[pdf])
demo.launch()
|