File size: 2,819 Bytes
fffbbeb
 
ae5ac9c
 
 
03a594f
 
 
ae5ac9c
 
 
 
 
 
 
fffbbeb
 
 
 
 
ae5ac9c
 
 
 
 
03a594f
 
 
 
 
 
 
 
 
 
 
 
 
fffbbeb
ae5ac9c
03a594f
 
 
 
 
fffbbeb
 
 
 
 
 
 
843d8b6
 
 
 
 
 
 
 
 
 
adf4200
843d8b6
 
 
 
 
03a594f
843d8b6
 
ae5ac9c
03a594f
fffbbeb
d4225f7
2c07938
fffbbeb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from PyPDF2 import PdfReader
import gradio as gr
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
from paddleocr import PPStructureV3
from pdf2image import convert_from_path
import numpy as np

# Shared Docling converter, built once at import time and reused for every
# request. Only the PDF input format gets custom pipeline options.
# NOTE(review): enable_remote_services=True presumably permits pipeline
# stages that call external services -- confirm against the Docling docs.
pipeline_options = PdfPipelineOptions(enable_remote_services=True)
_pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
converter = DocumentConverter(format_options={InputFormat.PDF: _pdf_format_option})

def get_pdf_page_count(pdf_path):
    """Return the number of pages in the PDF located at ``pdf_path``."""
    return len(PdfReader(pdf_path).pages)

def get_docling_ocr(pdf_path, page_num):
    """Convert one page of a PDF with Docling and return it as Markdown.

    Args:
        pdf_path: Path of the PDF to convert.
        page_num: Page to convert; used as both ends of the page range.

    Returns:
        The Markdown export of the converted document.
    """
    single_page = (page_num, page_num)
    conversion = converter.convert(pdf_path, page_range=single_page)
    return conversion.document.export_to_markdown()

# Lazily created PPStructureV3 pipeline, shared by all calls.
_paddle_pipeline = None


def _get_paddle_pipeline():
    """Return the shared PPStructureV3 pipeline, creating it on first use."""
    global _paddle_pipeline
    if _paddle_pipeline is None:
        _paddle_pipeline = PPStructureV3()
    return _paddle_pipeline


def get_paddle_ocr(page_image):
    """Run PaddleOCR's PPStructureV3 pipeline on one page image.

    The pipeline is constructed once and reused, so its models are not
    reloaded on every request.

    Args:
        page_image: Page image; it is converted with ``np.array`` before
            being passed to the pipeline.

    Returns:
        The value of ``concatenate_markdown_pages`` applied to the
        per-result ``markdown`` entries of the prediction output.
    """
    pipeline = _get_paddle_pipeline()
    output = pipeline.predict(input=np.array(page_image))
    markdown_pages = [res.markdown for res in output]
    return pipeline.concatenate_markdown_pages(markdown_pages)

def inference(pdf_path, page_num):
    """Run both OCR engines on a single page of a PDF.

    Args:
        pdf_path: Path of the PDF to process.
        page_num: Number of the page to process. It is coerced to ``int``
            because a UI slider may deliver the value as a float, while the
            page-range and rasterization calls expect integers.

    Returns:
        A ``(docling_ocr, paddle_ocr)`` tuple with the output of each engine
        for the requested page.
    """
    page_num = int(page_num)
    docling_ocr = get_docling_ocr(pdf_path, page_num)
    # Rasterize only the requested page; the returned list then holds
    # exactly that page as its first element.
    images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
    page_image = images[0]
    paddle_ocr = get_paddle_ocr(page_image)
    return docling_ocr, paddle_ocr

# Display strings for the demo and the sample PDFs handed to gr.Examples.
title = "OCR Arena"
description = (
    "A simple Gradio interface to extract text from PDFs and compare OCR models"
)
examples = [
    ["data/amazon-10-k-2024.pdf"],
    ["data/goog-10-k-2023.pdf"],
]

# Assemble the UI: PDF upload and page picker in the first column, the two
# OCR outputs in the second, and the sample documents underneath.
with gr.Blocks(theme=gr.themes.Glass(), title=title) as demo:
    # Show the title and description strings defined above as the page heading.
    gr.Markdown(f"# {title}\n\n{description}")

    with gr.Row():
        with gr.Column():
            pdf = gr.File(label="Input PDFs", file_types=[".pdf"])

            @gr.render(inputs=pdf)
            def show_slider(pdf_path):
                """Render the page picker and buttons for the uploaded PDF."""
                if pdf_path is None:
                    gr.Markdown("## No Input Provided")
                    return

                # The slider range depends on the uploaded file, which is
                # why these controls are created inside a render function.
                page_count = get_pdf_page_count(pdf_path)
                page_num = gr.Slider(1, page_count, value=1, step=1, label="Page Number")

                with gr.Row():
                    gr.ClearButton(components=[pdf, page_num])
                    submit_btn = gr.Button("Submit", variant='primary')

                # docling_ocr_out / paddle_ocr_out are created further down;
                # they are looked up when this function runs, by which time
                # the whole layout has been built.
                submit_btn.click(
                    inference,
                    inputs=[pdf, page_num],
                    outputs=[docling_ocr_out, paddle_ocr_out],
                )

        with gr.Column():
            docling_ocr_out = gr.Textbox(label="Docling OCR Output", type="text")
            paddle_ocr_out = gr.Textbox(label="Paddle OCR Output", type="text")

    examples_obj = gr.Examples(examples=examples, inputs=[pdf])

demo.launch()