Michele Dolfi committed on
Commit
84568d7
·
unverified ·
1 Parent(s): 3d47a36

feat: upgrade endpoint to docling v2 (#13)

Browse files

* upgrade endpoint to docling v2

Signed-off-by: Michele Dolfi <[email protected]>

* fix Containerfile

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Michele Dolfi <[email protected]>

Containerfile CHANGED
@@ -20,7 +20,7 @@ RUN if [ "$CPU_ONLY" = "true" ]; then \
20
  ENV HF_HOME=/tmp/
21
  ENV TORCH_HOME=/tmp/
22
 
23
- RUN poetry run python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
24
 
25
  # On container environments, always set a thread budget to avoid undesired thread congestion.
26
  ENV OMP_NUM_THREADS=4
@@ -29,4 +29,4 @@ COPY ./docling_serve /docling-serve/docling_serve
29
 
30
  EXPOSE 5000
31
 
32
- CMD ["poetry", "run", "uvicorn", "--port", "5000", "docling_serve.app:app"]
 
20
  ENV HF_HOME=/tmp/
21
  ENV TORCH_HOME=/tmp/
22
 
23
+ RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
24
 
25
  # On container environments, always set a thread budget to avoid undesired thread congestion.
26
  ENV OMP_NUM_THREADS=4
 
29
 
30
  EXPOSE 5000
31
 
32
+ CMD ["poetry", "run", "uvicorn", "--port", "5000", "--host", "0.0.0.0", "docling_serve.app:app"]
docling_serve/app.py CHANGED
@@ -1,21 +1,55 @@
1
  import base64
 
2
  from contextlib import asynccontextmanager
 
3
  from io import BytesIO
4
- from pathlib import Path
5
- from typing import Any, Dict, Union
6
 
7
  import httpx
8
  from docling.datamodel.base_models import (
9
  ConversionStatus,
10
  DocumentStream,
11
- PipelineOptions,
 
12
  )
13
- from docling.datamodel.document import ConversionResult, DocumentConversionInput
14
- from docling.document_converter import DocumentConverter
15
- from fastapi import FastAPI, HTTPException
16
- from pydantic import BaseModel
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- from docling_serve.settings import Settings
 
19
 
20
 
21
  class HttpSource(BaseModel):
@@ -28,16 +62,30 @@ class FileSource(BaseModel):
28
  filename: str
29
 
30
 
31
- class ConvertDocumentHttpSourceRequest(BaseModel):
32
  http_source: HttpSource
33
 
34
 
35
- class ConvertDocumentFileSourceRequest(BaseModel):
36
  file_source: FileSource
37
 
38
 
 
 
 
 
 
 
39
  class ConvertDocumentResponse(BaseModel):
40
- content_md: str
 
 
 
 
 
 
 
 
41
 
42
 
43
  ConvertDocumentRequest = Union[
@@ -45,20 +93,93 @@ ConvertDocumentRequest = Union[
45
  ]
46
 
47
 
48
- models = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
 
51
  @asynccontextmanager
52
  async def lifespan(app: FastAPI):
53
- # Converter
54
- settings = Settings()
55
- pipeline_options = PipelineOptions()
56
- pipeline_options.do_ocr = settings.do_ocr
57
- pipeline_options.do_table_structure = settings.do_table_structure
58
- models["converter"] = DocumentConverter(pipeline_options=pipeline_options)
 
 
 
 
 
 
 
59
  yield
60
 
61
- models.clear()
62
 
63
 
64
  app = FastAPI(
@@ -67,10 +188,14 @@ app = FastAPI(
67
  )
68
 
69
 
70
- @app.post("/convert")
71
- def convert_pdf_document(
 
 
 
 
72
  body: ConvertDocumentRequest,
73
- ) -> ConvertDocumentResponse:
74
 
75
  filename: str
76
  buf: BytesIO
@@ -81,16 +206,74 @@ def convert_pdf_document(
81
  elif isinstance(body, ConvertDocumentHttpSourceRequest):
82
  http_res = httpx.get(body.http_source.url, headers=body.http_source.headers)
83
  buf = BytesIO(http_res.content)
84
- filename = Path(
85
- body.http_source.url
86
- ).name # TODO: use better way to detect filename, e.g. from Content-Disposition
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
- docs_input = DocumentConversionInput.from_streams(
89
- [DocumentStream(filename=filename, stream=buf)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  )
91
- result: ConversionResult = next(models["converter"].convert(docs_input), None)
92
 
93
- if result is None or result.status != ConversionStatus.SUCCESS:
94
- raise HTTPException(status_code=500, detail={"errors": result.errors})
95
 
96
- return ConvertDocumentResponse(content_md=result.render_as_markdown())
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import base64
2
+ import hashlib
3
  from contextlib import asynccontextmanager
4
+ from enum import Enum
5
  from io import BytesIO
6
+ from typing import Any, Dict, List, Optional, Tuple, Union
 
7
 
8
  import httpx
9
  from docling.datamodel.base_models import (
10
  ConversionStatus,
11
  DocumentStream,
12
+ ErrorItem,
13
+ InputFormat,
14
  )
15
+ from docling.datamodel.document import ConversionResult
16
+ from docling.datamodel.pipeline_options import (
17
+ EasyOcrOptions,
18
+ OcrOptions,
19
+ PdfPipelineOptions,
20
+ RapidOcrOptions,
21
+ TesseractOcrOptions,
22
+ )
23
+ from docling.document_converter import DocumentConverter, PdfFormatOption
24
+ from docling.utils.profiling import ProfilingItem
25
+ from docling_core.types.doc import DoclingDocument, ImageRefMode
26
+ from docling_core.utils.file import resolve_remote_filename
27
+ from fastapi import FastAPI, HTTPException, Response
28
+ from pydantic import AnyHttpUrl, BaseModel
29
+
30
+
31
# TODO: import enum from Docling, once it is exposed
class OcrEngine(str, Enum):
    """OCR backends that a conversion request may select.

    Members are also ``str`` instances, so each one compares equal to its raw
    string value.
    """

    EASYOCR = "easyocr"
    TESSERACT = "tesseract"
    RAPIDOCR = "rapidocr"
36
+
37
+
38
class ConvertOptions(BaseModel):
    """Per-request switches controlling the conversion and the returned formats."""

    # --- Which representations the /convert endpoint fills in its response ---
    output_docling_document: bool = True
    output_markdown: bool = False
    output_html: bool = False

    # --- OCR configuration ---
    # Forwarded as ``do_ocr`` to the PDF pipeline options.
    do_ocr: bool = True
    # Selects which OCR options class is built for the pipeline.
    ocr_engine: OcrEngine = OcrEngine.EASYOCR
    # When not None, overrides the ``lang`` attribute of the OCR options.
    ocr_lang: Optional[List[str]] = None
    # Forwarded as ``force_full_page_ocr`` to the OCR options.
    force_ocr: bool = False

    # --- Layout / image handling ---
    # Forwarded as ``do_table_structure`` to the PDF pipeline options.
    do_table_structure: bool = True
    # Enables page and picture image generation; also switches exports between
    # embedded images and placeholders.
    include_images: bool = True
    # Forwarded as ``images_scale`` to the PDF pipeline options.
    images_scale: float = 2.0
49
+
50
 
51
class DocumentConvertBase(BaseModel):
    """Common base of the conversion request models; carries the options."""

    # Defaults to a ConvertOptions built with all of its default values.
    options: ConvertOptions = ConvertOptions()
53
 
54
 
55
  class HttpSource(BaseModel):
 
62
  filename: str
63
 
64
 
65
class ConvertDocumentHttpSourceRequest(DocumentConvertBase):
    """Conversion request whose document is described by an ``HttpSource``."""

    http_source: HttpSource
67
 
68
 
69
class ConvertDocumentFileSourceRequest(DocumentConvertBase):
    """Conversion request whose document is described by a ``FileSource``."""

    file_source: FileSource
71
 
72
 
73
class DocumentResponse(BaseModel):
    """Converted document in up to three representations.

    Every field defaults to ``None``.
    """

    markdown: Optional[str] = None
    docling_document: Optional[DoclingDocument] = None
    html: Optional[str] = None


class ConvertDocumentResponse(BaseModel):
    """Response model for a conversion: the document plus run metadata."""

    document: DocumentResponse
    status: ConversionStatus
    # Empty unless the caller supplies errors explicitly.
    errors: List[ErrorItem] = []
    # Profiling items keyed by name.
    timings: Dict[str, ProfilingItem] = {}
84
+
85
+
86
class ConvertDocumentErrorResponse(BaseModel):
    """Error payload model; currently exposes only the conversion status."""

    status: ConversionStatus
    # errors: List[ErrorItem] = []
89
 
90
 
91
  ConvertDocumentRequest = Union[
 
93
  ]
94
 
95
 
96
class MarkdownTextResponse(Response):
    """Response subclass that declares the ``text/markdown`` media type."""

    media_type = "text/markdown"
98
+
99
+
100
class HealthCheckResponse(BaseModel):
    """Payload model for the health endpoint; ``status`` defaults to ``"ok"``."""

    status: str = "ok"
102
+
103
+
104
def _ocr_engine_unavailable(engine: OcrEngine) -> HTTPException:
    """Build the HTTP 400 error reported when an OCR backend cannot be imported."""
    return HTTPException(
        status_code=400,
        detail="The requested OCR engine"
        f" (ocr_engine={engine.value})"
        " is not available on this system. Please choose another OCR engine "
        "or contact your system administrator.",
    )


def get_pdf_pipeline_opts(options: ConvertOptions) -> Tuple[PdfPipelineOptions, str]:
    """Translate request options into PDF pipeline options plus a lookup key.

    Args:
        options: Conversion options supplied with the request.

    Returns:
        A tuple ``(pipeline_options, options_hash)`` where ``options_hash`` is
        the SHA-1 hex digest of the JSON dump of ``pipeline_options``. Equal
        pipeline options therefore yield equal hashes.

    Raises:
        HTTPException: status 400 when the package backing the selected OCR
            engine cannot be imported.
        RuntimeError: when ``options.ocr_engine`` is not a known ``OcrEngine``.
    """
    engine = options.ocr_engine
    ocr_options: OcrOptions

    # Each branch probes the backend with an import before building its options,
    # so a missing package is reported as a client error here.
    # The ImportError is chained so the original cause stays in the traceback.
    if engine == OcrEngine.EASYOCR:
        try:
            import easyocr  # noqa: F401
        except ImportError as err:
            raise _ocr_engine_unavailable(engine) from err
        ocr_options = EasyOcrOptions(force_full_page_ocr=options.force_ocr)
    elif engine == OcrEngine.TESSERACT:
        try:
            import tesserocr  # noqa: F401
        except ImportError as err:
            raise _ocr_engine_unavailable(engine) from err
        ocr_options = TesseractOcrOptions(force_full_page_ocr=options.force_ocr)
    elif engine == OcrEngine.RAPIDOCR:
        try:
            from rapidocr_onnxruntime import RapidOCR  # noqa: F401
        except ImportError as err:
            raise _ocr_engine_unavailable(engine) from err
        ocr_options = RapidOcrOptions(force_full_page_ocr=options.force_ocr)
    else:
        raise RuntimeError(f"Unexpected OCR engine type {options.ocr_engine}")

    # Only override the engine's own language default when the caller asked to.
    if options.ocr_lang is not None:
        ocr_options.lang = options.ocr_lang

    pipeline_options = PdfPipelineOptions(
        do_ocr=options.do_ocr,
        ocr_options=ocr_options,
        do_table_structure=options.do_table_structure,
        generate_page_images=options.include_images,
        generate_picture_images=options.include_images,
        images_scale=options.images_scale,
    )

    # The digest is used purely as a dictionary key, not for security.
    options_hash = hashlib.sha1(pipeline_options.model_dump_json().encode()).hexdigest()

    return pipeline_options, options_hash
160
+
161
+
162
# Converters keyed by the options hash returned by get_pdf_pipeline_opts.
converters: Dict[str, DocumentConverter] = {}


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the default converter at startup and drop all converters at exit.

    Before yielding, a ``DocumentConverter`` configured with the default
    ``ConvertOptions`` is registered in ``converters`` and its PDF pipeline is
    initialized. After the yield, ``converters`` is emptied. The cleanup sits in
    a ``finally`` so it also runs when an exception propagates through the
    yield.

    Args:
        app: The FastAPI application (unused in the body).
    """
    # settings = Settings()

    # Converter with default options
    pipeline_options, options_hash = get_pdf_pipeline_opts(ConvertOptions())
    default_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
        }
    )
    converters[options_hash] = default_converter

    # Initialize during startup rather than on the first request.
    # NOTE(review): presumably this loads the models up front — confirm in docling.
    default_converter.initialize_pipeline(InputFormat.PDF)

    try:
        yield
    finally:
        converters.clear()
183
 
184
 
185
  app = FastAPI(
 
188
  )
189
 
190
 
191
@app.get("/health")
def health() -> HealthCheckResponse:
    """Answer the health probe with a default ``HealthCheckResponse``."""
    response = HealthCheckResponse()
    return response
194
+
195
+
196
+ def _convert_document(
197
  body: ConvertDocumentRequest,
198
+ ) -> ConversionResult:
199
 
200
  filename: str
201
  buf: BytesIO
 
206
  elif isinstance(body, ConvertDocumentHttpSourceRequest):
207
  http_res = httpx.get(body.http_source.url, headers=body.http_source.headers)
208
  buf = BytesIO(http_res.content)
209
+ filename = resolve_remote_filename(
210
+ http_url=AnyHttpUrl(body.http_source.url),
211
+ response_headers=dict(**http_res.headers),
212
+ )
213
+
214
+ doc_input = DocumentStream(name=filename, stream=buf)
215
+
216
+ pipeline_options, options_hash = get_pdf_pipeline_opts(body.options)
217
+ if options_hash not in converters:
218
+ converters[options_hash] = DocumentConverter(
219
+ format_options={
220
+ InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
221
+ InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
222
+ }
223
+ )
224
+
225
+ result: ConversionResult = converters[options_hash].convert(doc_input)
226
+
227
+ if result is None or result.status == ConversionStatus.SKIPPED:
228
+ raise HTTPException(status_code=400, detail=result.errors)
229
+
230
+ if result is None or result.status not in {
231
+ ConversionStatus.SUCCESS,
232
+ }:
233
+ raise HTTPException(
234
+ status_code=500, detail={"errors": result.errors, "status": result.status}
235
+ )
236
+
237
+ return result
238
 
239
+
240
@app.post("/convert")
def convert_document(body: ConvertDocumentRequest) -> ConvertDocumentResponse:
    """Convert the requested document and return the selected representations.

    Args:
        body: Conversion request; ``body.options`` selects the output formats
            and whether images are embedded.

    Returns:
        A ``ConvertDocumentResponse`` carrying the requested document
        representations, the conversion status and the timings.
    """
    result = _convert_document(body=body)
    requested = body.options

    # Embed images only when they were generated; otherwise leave placeholders.
    if requested.include_images:
        image_mode = ImageRefMode.EMBEDDED
    else:
        image_mode = ImageRefMode.PLACEHOLDER

    # Fill in only the formats the caller asked for; the rest stay None.
    doc_resp = DocumentResponse()
    if requested.output_docling_document:
        doc_resp.docling_document = result.document
    if requested.output_markdown:
        doc_resp.markdown = result.document.export_to_markdown(image_mode=image_mode)
    if requested.output_html:
        doc_resp.html = result.document.export_to_html(image_mode=image_mode)

    return ConvertDocumentResponse(
        document=doc_resp,
        status=result.status,
        timings=result.timings,
    )
 
265
 
 
 
266
 
267
@app.post("/convert/markdown", response_class=MarkdownTextResponse)
def convert_document_md(body: ConvertDocumentRequest) -> MarkdownTextResponse:
    """Convert the requested document and return it as a Markdown response.

    Args:
        body: Conversion request; ``body.options.include_images`` decides
            whether images are embedded or replaced by placeholders.

    Returns:
        A ``MarkdownTextResponse`` whose content is the Markdown export.
    """
    result = _convert_document(body=body)

    if body.options.include_images:
        image_mode = ImageRefMode.EMBEDDED
    else:
        image_mode = ImageRefMode.PLACEHOLDER

    markdown = result.document.export_to_markdown(image_mode=image_mode)
    return MarkdownTextResponse(markdown)
docling_serve/settings.py CHANGED
@@ -2,7 +2,5 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
2
 
3
 
4
  class Settings(BaseSettings):
5
- do_ocr: bool = True
6
- do_table_structure: bool = True
7
 
8
  model_config = SettingsConfigDict(env_prefix="DOCLING_")
 
2
 
3
 
4
  class Settings(BaseSettings):
 
 
5
 
6
  model_config = SettingsConfigDict(env_prefix="DOCLING_")
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -30,12 +30,26 @@ classifiers = [
30
  ]
31
 
32
  [tool.poetry.dependencies]
33
- python = "^3.10"
34
- docling = "^1.11.0"
35
- fastapi = {version = "^0.110.2", extras = ["standard"]}
36
- uvicorn = "^0.30.6"
37
  pydantic-settings = "^2.4.0"
38
- httpx = "^0.27.2"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  [tool.poetry.group.pypi-torch]
41
  optional = false
@@ -63,6 +77,12 @@ torchvision = [
63
  {markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.12"', url="https://download.pytorch.org/whl/cpu/torchvision-0.19.1%2Bcpu-cp312-cp312-linux_x86_64.whl"},
64
  ]
65
 
 
 
 
 
 
 
66
  [tool.poetry.group.dev.dependencies]
67
  black = "^24.8.0"
68
  isort = "^5.13.2"
@@ -93,8 +113,17 @@ remove-unused-variables = true
93
  expand-star-imports = true
94
  recursive = true
95
 
 
 
 
 
 
 
 
96
  [[tool.mypy.overrides]]
97
  module = [
98
- "docling.*",
 
 
99
  ]
100
  ignore_missing_imports = true
 
30
  ]
31
 
32
  [tool.poetry.dependencies]
33
+ python = "^3.9"
34
+ docling = "^2.10.0"
35
+ fastapi = {version = "^0.115.6", extras = ["standard"]}
36
+ uvicorn = "^0.32.1"
37
  pydantic-settings = "^2.4.0"
38
+ httpx = "^0.28.1"
39
+ tesserocr = { version = "^2.7.1", optional = true }
40
+ rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
41
+ onnxruntime = [
42
+ # 1.19.2 is the last version with python3.9 support,
43
+ # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
44
+ { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
45
+ { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
46
+ ]
47
+
48
+
49
+ [tool.poetry.extras]
50
+ tesserocr = ["tesserocr"]
51
+ rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
52
+
53
 
54
  [tool.poetry.group.pypi-torch]
55
  optional = false
 
77
  {markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.12"', url="https://download.pytorch.org/whl/cpu/torchvision-0.19.1%2Bcpu-cp312-cp312-linux_x86_64.whl"},
78
  ]
79
 
80
+ [tool.poetry.group.constraints.dependencies]
81
+ numpy = [
82
+ { version = "^2.1.0", markers = 'python_version >= "3.13"' },
83
+ { version = "^1.24.4", markers = 'python_version < "3.13"' },
84
+ ]
85
+
86
  [tool.poetry.group.dev.dependencies]
87
  black = "^24.8.0"
88
  isort = "^5.13.2"
 
113
  expand-star-imports = true
114
  recursive = true
115
 
116
+ [tool.mypy]
117
+ pretty = true
118
+ # strict = true
119
+ no_implicit_optional = true
120
+ plugins = "pydantic.mypy"
121
+ python_version = "3.10"
122
+
123
  [[tool.mypy.overrides]]
124
  module = [
125
+ "easyocr.*",
126
+ "tesserocr.*",
127
+ "rapidocr_onnxruntime.*",
128
  ]
129
  ignore_missing_imports = true