"""Gradio app that runs Docling and PaddleOCR (PP-StructureV3) on one PDF page.

The user uploads a PDF and picks a page with a slider; the Markdown produced
by each engine for that page is written to its own text box.
"""

from functools import lru_cache

from PyPDF2 import PdfReader
import gradio as gr
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
from paddleocr import PPStructureV3
from pdf2image import convert_from_path
import numpy as np

# A single Docling converter is built at import time and shared by all requests.
pipeline_options = PdfPipelineOptions(enable_remote_services=True)
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    }
)


@lru_cache(maxsize=1)
def _get_paddle_pipeline():
    """Return the shared PPStructureV3 pipeline, building it on first use.

    The original code constructed a new pipeline for every request; caching it
    avoids repeating that construction on each call.
    NOTE(review): the instance is now shared between requests -- confirm that
    is safe if the Gradio concurrency limit is ever raised.
    """
    return PPStructureV3()


def get_pdf_page_count(pdf_path: str) -> int:
    """Return the number of pages in the PDF at ``pdf_path``."""
    return len(PdfReader(pdf_path).pages)


def get_docling_ocr(pdf_path: str, page_num: int) -> str:
    """Convert a single page of the PDF with Docling and return Markdown.

    Args:
        pdf_path: Path of the PDF file to convert.
        page_num: Page to convert; used as both ends of Docling's
            ``page_range`` so only that page is processed.

    Returns:
        The converted document exported as a Markdown string.
    """
    result = converter.convert(pdf_path, page_range=(page_num, page_num))
    return result.document.export_to_markdown()


def get_paddle_ocr(page_image):
    """Run PP-StructureV3 on one page image and return its Markdown.

    Args:
        page_image: Page image; it is converted with ``np.array`` before being
            handed to the pipeline.
            NOTE(review): pdf2image yields PIL images (presumably RGB) --
            confirm the channel order PPStructureV3 expects for arrays.

    Returns:
        Whatever ``concatenate_markdown_pages`` produces for the per-result
        markdown entries of this image.
    """
    pipeline = _get_paddle_pipeline()
    output = pipeline.predict(input=np.array(page_image))
    markdown_list = [res.markdown for res in output]
    return pipeline.concatenate_markdown_pages(markdown_list)


def inference(pdf_path, page_num):
    """Run both OCR engines on the selected page of the uploaded PDF.

    Args:
        pdf_path: Path of the uploaded PDF, or ``None`` when nothing has been
            uploaded yet.
        page_num: Page selected on the slider.

    Returns:
        A ``(docling_markdown, paddle_markdown)`` tuple.

    Raises:
        gr.Error: If no PDF has been provided.
    """
    # Submit is also wired up in the empty state, where ``page_num`` is the
    # placeholder Markdown text, so bail out before touching either engine.
    if not pdf_path:
        raise gr.Error("Please upload a PDF before pressing Submit.")

    # Both libraries take the page as an integer index.
    page_num = int(page_num)

    docling_ocr = get_docling_ocr(pdf_path, page_num)

    # Rasterise only the selected page for PaddleOCR.
    images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
    paddle_ocr = get_paddle_ocr(images[0])

    return docling_ocr, paddle_ocr


title = "OCR Arena"
description = "A simple Gradio interface to extract text from PDFs and compare OCR models"
examples = [["data/amazon-10-k-2024.pdf"], ["data/goog-10-k-2023.pdf"]]

with gr.Blocks(theme=gr.themes.Glass()) as demo:
    with gr.Row():
        with gr.Column():
            pdf = gr.File(label="Input PDFs", file_types=[".pdf"])

            @gr.render(inputs=pdf)
            def show_slider(pdf_path):
                """Render the page selector and action buttons for ``pdf_path``.

                With no PDF a placeholder message is shown; otherwise a slider
                spanning 1..page_count. The output text boxes are created
                later in the layout and are resolved when this function runs.
                """
                if pdf_path is None:
                    page_num = gr.Markdown("## No Input Provided")
                else:
                    page_count = get_pdf_page_count(pdf_path)
                    page_num = gr.Slider(
                        1, page_count, value=1, step=1, label="Page Number"
                    )

                with gr.Row():
                    gr.ClearButton(components=[pdf, page_num])
                    submit_btn = gr.Button("Submit", variant='primary')

                submit_btn.click(
                    inference,
                    inputs=[pdf, page_num],
                    outputs=[docling_ocr_out, paddle_ocr_out],
                )

        with gr.Column():
            docling_ocr_out = gr.Textbox(label="Docling OCR Output", type="text")
            paddle_ocr_out = gr.Textbox(label="Paddle OCR Output", type="text")

    examples_obj = gr.Examples(examples=examples, inputs=[pdf])

demo.launch()