Spaces:
Running
Running
Commit
·
ae5ac9c
1
Parent(s):
adf4200
Adding Docling
Browse files- app.py +19 -3
- requirements.txt +55 -0
app.py
CHANGED
@@ -1,12 +1,28 @@
|
|
1 |
from PyPDF2 import PdfReader
|
2 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
def get_pdf_page_count(pdf_path):
|
5 |
reader = PdfReader(pdf_path)
|
6 |
return len(reader.pages)
|
7 |
|
|
|
|
|
|
|
|
|
|
|
8 |
def inference(pdf_path, page_num):
|
9 |
-
|
|
|
10 |
|
11 |
title = "OCR Arena"
|
12 |
description = "A simple Gradio interface to extract text from PDFs and compare OCR models"
|
@@ -30,10 +46,10 @@ with gr.Blocks(theme=gr.themes.Glass()) as demo:
|
|
30 |
clear_btn = gr.ClearButton(components=[pdf, page_num])
|
31 |
submit_btn = gr.Button("Submit", variant='primary')
|
32 |
|
33 |
-
submit_btn.click(inference, inputs=[pdf, page_num], outputs=
|
34 |
|
35 |
with gr.Column():
|
36 |
-
|
37 |
|
38 |
examples_obj = gr.Examples(examples=examples, inputs=[pdf])
|
39 |
|
|
|
1 |
from PyPDF2 import PdfReader
|
2 |
import gradio as gr
|
3 |
+
from docling.document_converter import DocumentConverter, PdfFormatOption
|
4 |
+
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
5 |
+
from docling.datamodel.base_models import InputFormat
|
6 |
+
|
7 |
+
pipeline_options = PdfPipelineOptions(enable_remote_services=True)
|
8 |
+
converter = DocumentConverter(
|
9 |
+
format_options={
|
10 |
+
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
11 |
+
}
|
12 |
+
)
|
13 |
|
14 |
def get_pdf_page_count(pdf_path):
|
15 |
reader = PdfReader(pdf_path)
|
16 |
return len(reader.pages)
|
17 |
|
18 |
+
def get_docling_ocr(pdf_path, page_num):
|
19 |
+
result = converter.convert(pdf_path, page_range=(page_num, page_num))
|
20 |
+
markdown_text_docling = result.document.export_to_markdown()
|
21 |
+
return markdown_text_docling
|
22 |
+
|
23 |
def inference(pdf_path, page_num):
|
24 |
+
docling_ocr = get_docling_ocr(pdf_path, page_num)
|
25 |
+
return docling_ocr
|
26 |
|
27 |
title = "OCR Arena"
|
28 |
description = "A simple Gradio interface to extract text from PDFs and compare OCR models"
|
|
|
46 |
clear_btn = gr.ClearButton(components=[pdf, page_num])
|
47 |
submit_btn = gr.Button("Submit", variant='primary')
|
48 |
|
49 |
+
submit_btn.click(inference, inputs=[pdf, page_num], outputs=docling_ocr_out)
|
50 |
|
51 |
with gr.Column():
|
52 |
+
docling_ocr_out = gr.Textbox(label="Docling OCR Output", type="text")
|
53 |
|
54 |
examples_obj = gr.Examples(examples=examples, inputs=[pdf])
|
55 |
|
requirements.txt
CHANGED
@@ -1,13 +1,23 @@
|
|
1 |
aiofiles==24.1.0
|
2 |
annotated-types==0.7.0
|
3 |
anyio==4.9.0
|
|
|
|
|
4 |
certifi==2025.6.15
|
5 |
charset-normalizer==3.4.2
|
6 |
click==8.2.1
|
7 |
colorama==0.4.6
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
fastapi==0.115.14
|
9 |
ffmpy==0.6.0
|
10 |
filelock==3.18.0
|
|
|
11 |
fsspec==2025.5.1
|
12 |
gradio==5.35.0
|
13 |
gradio_client==1.10.4
|
@@ -17,35 +27,79 @@ httpcore==1.0.9
|
|
17 |
httpx==0.28.1
|
18 |
huggingface-hub==0.33.1
|
19 |
idna==3.10
|
|
|
20 |
Jinja2==3.1.6
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
markdown-it-py==3.0.0
|
|
|
22 |
MarkupSafe==3.0.2
|
23 |
mdurl==0.1.2
|
|
|
|
|
|
|
|
|
|
|
24 |
numpy==2.2.6
|
|
|
|
|
25 |
orjson==3.10.18
|
26 |
packaging==25.0
|
27 |
pandas==2.3.0
|
28 |
pillow==11.2.1
|
|
|
|
|
29 |
pydantic==2.11.7
|
|
|
30 |
pydantic_core==2.33.2
|
31 |
pydub==0.25.1
|
32 |
Pygments==2.19.2
|
|
|
33 |
PyPDF2==3.0.1
|
|
|
|
|
34 |
python-dateutil==2.9.0.post0
|
|
|
|
|
35 |
python-multipart==0.0.20
|
|
|
36 |
pytz==2025.2
|
|
|
37 |
PyYAML==6.0.2
|
|
|
|
|
38 |
requests==2.32.4
|
39 |
rich==14.0.0
|
|
|
|
|
40 |
ruff==0.12.1
|
41 |
safehttpx==0.1.6
|
|
|
|
|
|
|
42 |
semantic-version==2.10.0
|
|
|
|
|
|
|
43 |
shellingham==1.5.4
|
44 |
six==1.17.0
|
45 |
sniffio==1.3.1
|
|
|
46 |
starlette==0.46.2
|
|
|
|
|
|
|
|
|
47 |
tomlkit==0.13.3
|
|
|
|
|
48 |
tqdm==4.67.1
|
|
|
49 |
typer==0.16.0
|
50 |
typing-inspection==0.4.1
|
51 |
typing_extensions==4.14.0
|
@@ -53,3 +107,4 @@ tzdata==2025.2
|
|
53 |
urllib3==2.5.0
|
54 |
uvicorn==0.34.3
|
55 |
websockets==15.0.1
|
|
|
|
1 |
aiofiles==24.1.0
|
2 |
annotated-types==0.7.0
|
3 |
anyio==4.9.0
|
4 |
+
attrs==25.3.0
|
5 |
+
beautifulsoup4==4.13.4
|
6 |
certifi==2025.6.15
|
7 |
charset-normalizer==3.4.2
|
8 |
click==8.2.1
|
9 |
colorama==0.4.6
|
10 |
+
dill==0.4.0
|
11 |
+
docling==2.39.0
|
12 |
+
docling-core==2.39.0
|
13 |
+
docling-ibm-models==3.6.0
|
14 |
+
docling-parse==4.1.0
|
15 |
+
easyocr==1.7.2
|
16 |
+
et_xmlfile==2.0.0
|
17 |
fastapi==0.115.14
|
18 |
ffmpy==0.6.0
|
19 |
filelock==3.18.0
|
20 |
+
filetype==1.2.0
|
21 |
fsspec==2025.5.1
|
22 |
gradio==5.35.0
|
23 |
gradio_client==1.10.4
|
|
|
27 |
httpx==0.28.1
|
28 |
huggingface-hub==0.33.1
|
29 |
idna==3.10
|
30 |
+
imageio==2.37.0
|
31 |
Jinja2==3.1.6
|
32 |
+
jsonlines==3.1.0
|
33 |
+
jsonref==1.1.0
|
34 |
+
jsonschema==4.24.0
|
35 |
+
jsonschema-specifications==2025.4.1
|
36 |
+
latex2mathml==3.78.0
|
37 |
+
lazy_loader==0.4
|
38 |
+
lxml==5.4.0
|
39 |
markdown-it-py==3.0.0
|
40 |
+
marko==2.1.4
|
41 |
MarkupSafe==3.0.2
|
42 |
mdurl==0.1.2
|
43 |
+
mpire==2.10.2
|
44 |
+
mpmath==1.3.0
|
45 |
+
multiprocess==0.70.18
|
46 |
+
networkx==3.5
|
47 |
+
ninja==1.11.1.4
|
48 |
numpy==2.2.6
|
49 |
+
opencv-python-headless==4.11.0.86
|
50 |
+
openpyxl==3.1.5
|
51 |
orjson==3.10.18
|
52 |
packaging==25.0
|
53 |
pandas==2.3.0
|
54 |
pillow==11.2.1
|
55 |
+
pluggy==1.6.0
|
56 |
+
pyclipper==1.3.0.post6
|
57 |
pydantic==2.11.7
|
58 |
+
pydantic-settings==2.10.1
|
59 |
pydantic_core==2.33.2
|
60 |
pydub==0.25.1
|
61 |
Pygments==2.19.2
|
62 |
+
pylatexenc==2.10
|
63 |
PyPDF2==3.0.1
|
64 |
+
pypdfium2==4.30.1
|
65 |
+
python-bidi==0.6.6
|
66 |
python-dateutil==2.9.0.post0
|
67 |
+
python-docx==1.2.0
|
68 |
+
python-dotenv==1.1.1
|
69 |
python-multipart==0.0.20
|
70 |
+
python-pptx==1.0.2
|
71 |
pytz==2025.2
|
72 |
+
pywin32==310
|
73 |
PyYAML==6.0.2
|
74 |
+
referencing==0.36.2
|
75 |
+
regex==2024.11.6
|
76 |
requests==2.32.4
|
77 |
rich==14.0.0
|
78 |
+
rpds-py==0.25.1
|
79 |
+
rtree==1.4.0
|
80 |
ruff==0.12.1
|
81 |
safehttpx==0.1.6
|
82 |
+
safetensors==0.5.3
|
83 |
+
scikit-image==0.25.2
|
84 |
+
scipy==1.16.0
|
85 |
semantic-version==2.10.0
|
86 |
+
semchunk==2.2.2
|
87 |
+
setuptools==80.9.0
|
88 |
+
shapely==2.1.1
|
89 |
shellingham==1.5.4
|
90 |
six==1.17.0
|
91 |
sniffio==1.3.1
|
92 |
+
soupsieve==2.7
|
93 |
starlette==0.46.2
|
94 |
+
sympy==1.14.0
|
95 |
+
tabulate==0.9.0
|
96 |
+
tifffile==2025.6.11
|
97 |
+
tokenizers==0.21.2
|
98 |
tomlkit==0.13.3
|
99 |
+
torch==2.7.1
|
100 |
+
torchvision==0.22.1
|
101 |
tqdm==4.67.1
|
102 |
+
transformers==4.53.0
|
103 |
typer==0.16.0
|
104 |
typing-inspection==0.4.1
|
105 |
typing_extensions==4.14.0
|
|
|
107 |
urllib3==2.5.0
|
108 |
uvicorn==0.34.3
|
109 |
websockets==15.0.1
|
110 |
+
xlsxwriter==3.2.5
|