AkashDataScience committed on
Commit
ae5ac9c
·
1 Parent(s): adf4200

Adding Docling

Browse files
Files changed (2) hide show
  1. app.py +19 -3
  2. requirements.txt +55 -0
app.py CHANGED
@@ -1,12 +1,28 @@
1
  from PyPDF2 import PdfReader
2
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
3
 
4
  def get_pdf_page_count(pdf_path):
5
  reader = PdfReader(pdf_path)
6
  return len(reader.pages)
7
 
 
 
 
 
 
8
  def inference(pdf_path, page_num):
9
- return "A"
 
10
 
11
  title = "OCR Arena"
12
  description = "A simple Gradio interface to extract text from PDFs and compare OCR models"
@@ -30,10 +46,10 @@ with gr.Blocks(theme=gr.themes.Glass()) as demo:
30
  clear_btn = gr.ClearButton(components=[pdf, page_num])
31
  submit_btn = gr.Button("Submit", variant='primary')
32
 
33
- submit_btn.click(inference, inputs=[pdf, page_num], outputs=ocr_out)
34
 
35
  with gr.Column():
36
- ocr_out = gr.Textbox(label="OCR Output", type="text")
37
 
38
  examples_obj = gr.Examples(examples=examples, inputs=[pdf])
39
 
 
1
  from PyPDF2 import PdfReader
2
  import gradio as gr
3
+ from docling.document_converter import DocumentConverter, PdfFormatOption
4
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
5
+ from docling.datamodel.base_models import InputFormat
6
+
7
+ pipeline_options = PdfPipelineOptions(enable_remote_services=True)
8
+ converter = DocumentConverter(
9
+ format_options={
10
+ InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
11
+ }
12
+ )
13
 
14
  def get_pdf_page_count(pdf_path):
15
  reader = PdfReader(pdf_path)
16
  return len(reader.pages)
17
 
18
+ def get_docling_ocr(pdf_path, page_num):
19
+ result = converter.convert(pdf_path, page_range=(page_num, page_num))
20
+ markdown_text_docling = result.document.export_to_markdown()
21
+ return markdown_text_docling
22
+
23
  def inference(pdf_path, page_num):
24
+ docling_ocr = get_docling_ocr(pdf_path, page_num)
25
+ return docling_ocr
26
 
27
  title = "OCR Arena"
28
  description = "A simple Gradio interface to extract text from PDFs and compare OCR models"
 
46
  clear_btn = gr.ClearButton(components=[pdf, page_num])
47
  submit_btn = gr.Button("Submit", variant='primary')
48
 
49
+ submit_btn.click(inference, inputs=[pdf, page_num], outputs=docling_ocr_out)
50
 
51
  with gr.Column():
52
+ docling_ocr_out = gr.Textbox(label="Docling OCR Output", type="text")
53
 
54
  examples_obj = gr.Examples(examples=examples, inputs=[pdf])
55
 
requirements.txt CHANGED
@@ -1,13 +1,23 @@
1
  aiofiles==24.1.0
2
  annotated-types==0.7.0
3
  anyio==4.9.0
 
 
4
  certifi==2025.6.15
5
  charset-normalizer==3.4.2
6
  click==8.2.1
7
  colorama==0.4.6
 
 
 
 
 
 
 
8
  fastapi==0.115.14
9
  ffmpy==0.6.0
10
  filelock==3.18.0
 
11
  fsspec==2025.5.1
12
  gradio==5.35.0
13
  gradio_client==1.10.4
@@ -17,35 +27,79 @@ httpcore==1.0.9
17
  httpx==0.28.1
18
  huggingface-hub==0.33.1
19
  idna==3.10
 
20
  Jinja2==3.1.6
 
 
 
 
 
 
 
21
  markdown-it-py==3.0.0
 
22
  MarkupSafe==3.0.2
23
  mdurl==0.1.2
 
 
 
 
 
24
  numpy==2.2.6
 
 
25
  orjson==3.10.18
26
  packaging==25.0
27
  pandas==2.3.0
28
  pillow==11.2.1
 
 
29
  pydantic==2.11.7
 
30
  pydantic_core==2.33.2
31
  pydub==0.25.1
32
  Pygments==2.19.2
 
33
  PyPDF2==3.0.1
 
 
34
  python-dateutil==2.9.0.post0
 
 
35
  python-multipart==0.0.20
 
36
  pytz==2025.2
 
37
  PyYAML==6.0.2
 
 
38
  requests==2.32.4
39
  rich==14.0.0
 
 
40
  ruff==0.12.1
41
  safehttpx==0.1.6
 
 
 
42
  semantic-version==2.10.0
 
 
 
43
  shellingham==1.5.4
44
  six==1.17.0
45
  sniffio==1.3.1
 
46
  starlette==0.46.2
 
 
 
 
47
  tomlkit==0.13.3
 
 
48
  tqdm==4.67.1
 
49
  typer==0.16.0
50
  typing-inspection==0.4.1
51
  typing_extensions==4.14.0
@@ -53,3 +107,4 @@ tzdata==2025.2
53
  urllib3==2.5.0
54
  uvicorn==0.34.3
55
  websockets==15.0.1
 
 
1
  aiofiles==24.1.0
2
  annotated-types==0.7.0
3
  anyio==4.9.0
4
+ attrs==25.3.0
5
+ beautifulsoup4==4.13.4
6
  certifi==2025.6.15
7
  charset-normalizer==3.4.2
8
  click==8.2.1
9
  colorama==0.4.6
10
+ dill==0.4.0
11
+ docling==2.39.0
12
+ docling-core==2.39.0
13
+ docling-ibm-models==3.6.0
14
+ docling-parse==4.1.0
15
+ easyocr==1.7.2
16
+ et_xmlfile==2.0.0
17
  fastapi==0.115.14
18
  ffmpy==0.6.0
19
  filelock==3.18.0
20
+ filetype==1.2.0
21
  fsspec==2025.5.1
22
  gradio==5.35.0
23
  gradio_client==1.10.4
 
27
  httpx==0.28.1
28
  huggingface-hub==0.33.1
29
  idna==3.10
30
+ imageio==2.37.0
31
  Jinja2==3.1.6
32
+ jsonlines==3.1.0
33
+ jsonref==1.1.0
34
+ jsonschema==4.24.0
35
+ jsonschema-specifications==2025.4.1
36
+ latex2mathml==3.78.0
37
+ lazy_loader==0.4
38
+ lxml==5.4.0
39
  markdown-it-py==3.0.0
40
+ marko==2.1.4
41
  MarkupSafe==3.0.2
42
  mdurl==0.1.2
43
+ mpire==2.10.2
44
+ mpmath==1.3.0
45
+ multiprocess==0.70.18
46
+ networkx==3.5
47
+ ninja==1.11.1.4
48
  numpy==2.2.6
49
+ opencv-python-headless==4.11.0.86
50
+ openpyxl==3.1.5
51
  orjson==3.10.18
52
  packaging==25.0
53
  pandas==2.3.0
54
  pillow==11.2.1
55
+ pluggy==1.6.0
56
+ pyclipper==1.3.0.post6
57
  pydantic==2.11.7
58
+ pydantic-settings==2.10.1
59
  pydantic_core==2.33.2
60
  pydub==0.25.1
61
  Pygments==2.19.2
62
+ pylatexenc==2.10
63
  PyPDF2==3.0.1
64
+ pypdfium2==4.30.1
65
+ python-bidi==0.6.6
66
  python-dateutil==2.9.0.post0
67
+ python-docx==1.2.0
68
+ python-dotenv==1.1.1
69
  python-multipart==0.0.20
70
+ python-pptx==1.0.2
71
  pytz==2025.2
72
+ pywin32==310
73
  PyYAML==6.0.2
74
+ referencing==0.36.2
75
+ regex==2024.11.6
76
  requests==2.32.4
77
  rich==14.0.0
78
+ rpds-py==0.25.1
79
+ rtree==1.4.0
80
  ruff==0.12.1
81
  safehttpx==0.1.6
82
+ safetensors==0.5.3
83
+ scikit-image==0.25.2
84
+ scipy==1.16.0
85
  semantic-version==2.10.0
86
+ semchunk==2.2.2
87
+ setuptools==80.9.0
88
+ shapely==2.1.1
89
  shellingham==1.5.4
90
  six==1.17.0
91
  sniffio==1.3.1
92
+ soupsieve==2.7
93
  starlette==0.46.2
94
+ sympy==1.14.0
95
+ tabulate==0.9.0
96
+ tifffile==2025.6.11
97
+ tokenizers==0.21.2
98
  tomlkit==0.13.3
99
+ torch==2.7.1
100
+ torchvision==0.22.1
101
  tqdm==4.67.1
102
+ transformers==4.53.0
103
  typer==0.16.0
104
  typing-inspection==0.4.1
105
  typing_extensions==4.14.0
 
107
  urllib3==2.5.0
108
  uvicorn==0.34.3
109
  websockets==15.0.1
110
+ xlsxwriter==3.2.5