Spaces:
Sleeping
Sleeping
File size: 2,819 Bytes
fffbbeb ae5ac9c 03a594f ae5ac9c fffbbeb ae5ac9c 03a594f fffbbeb ae5ac9c 03a594f fffbbeb 843d8b6 adf4200 843d8b6 03a594f 843d8b6 ae5ac9c 03a594f fffbbeb d4225f7 2c07938 fffbbeb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
from PyPDF2 import PdfReader
import gradio as gr
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
from paddleocr import PPStructureV3
from pdf2image import convert_from_path
import numpy as np
# Docling PDF pipeline settings; remote services are explicitly switched on.
pipeline_options = PdfPipelineOptions(enable_remote_services=True)

# Module-level converter, built once at import time and reused by
# get_docling_ocr for every request. Only the PDF input format is configured.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    },
)
def get_pdf_page_count(pdf_path):
    """Return the number of pages in the PDF located at ``pdf_path``."""
    return len(PdfReader(pdf_path).pages)
def get_docling_ocr(pdf_path, page_num):
    """Convert a single PDF page with Docling and return it as Markdown.

    ``page_num`` is used as both ends of ``page_range`` so that only that
    one page is converted by the shared module-level ``converter``.
    """
    single_page = (page_num, page_num)
    conversion = converter.convert(pdf_path, page_range=single_page)
    return conversion.document.export_to_markdown()
# Lazily created PPStructureV3 pipeline shared by every get_paddle_ocr call.
_paddle_pipeline = None


def _get_paddle_pipeline():
    """Return the shared PPStructureV3 pipeline, creating it on first use."""
    global _paddle_pipeline
    if _paddle_pipeline is None:
        _paddle_pipeline = PPStructureV3()
    return _paddle_pipeline


def get_paddle_ocr(page_image):
    """Run the PP-StructureV3 pipeline on one page image and return Markdown.

    Args:
        page_image: The rendered page; it is converted with ``np.array``
            before being handed to the pipeline.

    Returns:
        The result of ``concatenate_markdown_pages`` over the ``markdown``
        attribute of every prediction result.
    """
    # Constructing PPStructureV3 on every request repeats the pipeline
    # set-up each time; reuse one instance instead.
    # NOTE(review): assumes a single pipeline instance can serve successive
    # requests -- confirm before raising the handler's concurrency.
    pipeline = _get_paddle_pipeline()
    output = pipeline.predict(input=np.array(page_image))
    markdown_list = [res.markdown for res in output]
    return pipeline.concatenate_markdown_pages(markdown_list)
def inference(pdf_path, page_num):
    """Run both OCR backends on one page of a PDF.

    Args:
        pdf_path: Path of the PDF to process, or ``None`` when no file has
            been provided.
        page_num: Number of the page to process, as selected in the UI.

    Returns:
        A ``(docling_markdown, paddle_markdown)`` tuple.

    Raises:
        gr.Error: If no PDF was provided, or if the requested page could
            not be rendered to an image.
    """
    # The Submit button is rendered even when no file is loaded, in which
    # case page_num is not a number either; fail with a readable message
    # instead of an error from deep inside an OCR backend.
    if pdf_path is None:
        raise gr.Error("Please upload a PDF before submitting.")

    # NOTE(review): the slider value may arrive as a float-typed number;
    # both backends are given an integer page index here -- confirm.
    page_num = int(page_num)

    docling_ocr = get_docling_ocr(pdf_path, page_num)

    # Render only the selected page as an image for the PaddleOCR pipeline.
    images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
    if not images:
        raise gr.Error(f"Could not render page {page_num} of the PDF.")

    paddle_ocr = get_paddle_ocr(images[0])
    return docling_ocr, paddle_ocr
# Display strings for the app.
title = "OCR Arena"
description = (
    "A simple Gradio interface to extract text from PDFs and compare OCR models"
)

# One row per example; each row supplies a value for the single PDF input.
examples = [
    ["data/amazon-10-k-2024.pdf"],
    ["data/goog-10-k-2023.pdf"],
]
# Build the UI: a left column with the PDF input and page controls, and a
# right column with one text box per OCR backend.
with gr.Blocks(theme=gr.themes.Glass()) as demo:
    with gr.Row():
        with gr.Column():
            pdf = gr.File(label="Input PDFs", file_types=[".pdf"])

            # Rendered from the current value of the `pdf` input, so the
            # page control can be sized to the uploaded document.
            @gr.render(inputs=pdf)
            def show_slider(pdf_path):
                """Create the page selector and the Clear/Submit buttons."""
                if pdf_path is None:
                    # No file yet: show a placeholder instead of a slider.
                    # NOTE(review): this Markdown component is still wired
                    # below as the page_num input of `inference`.
                    page_num = gr.Markdown("## No Input Provided")
                else:
                    # Slider runs from 1 to the document's page count.
                    page_count = get_pdf_page_count(pdf_path)
                    page_num = gr.Slider(1, page_count, value=1, step=1, label="Page Number")
                with gr.Row():
                    clear_btn = gr.ClearButton(components=[pdf, page_num])
                    submit_btn = gr.Button("Submit", variant='primary')
                # The output text boxes are created further down, after this
                # function is defined; they are looked up when it runs.
                submit_btn.click(inference, inputs=[pdf, page_num], outputs=[docling_ocr_out, paddle_ocr_out])

        with gr.Column():
            docling_ocr_out = gr.Textbox(label="Docling OCR Output", type="text")
            paddle_ocr_out = gr.Textbox(label="Paddle OCR Output", type="text")

    # Example PDFs that populate the `pdf` input.
    examples_obj = gr.Examples(examples=examples, inputs=[pdf])

# Start the app as soon as the module is executed.
demo.launch()
|