Spaces:
Configuration error
Configuration error
Michele Dolfi
committed on
feat: Async api (#60)
Browse files
Signed-off-by: Michele Dolfi <[email protected]>
- .pre-commit-config.yaml +9 -9
- Containerfile +1 -1
- docling_serve/app.py +170 -12
- docling_serve/datamodel/__init__.py +0 -0
- docling_serve/datamodel/convert.py +174 -0
- docling_serve/datamodel/engines.py +30 -0
- docling_serve/datamodel/requests.py +62 -0
- docling_serve/datamodel/responses.py +52 -0
- docling_serve/docling_conversion.py +5 -237
- docling_serve/engines/__init__.py +8 -0
- docling_serve/engines/async_local/__init__.py +0 -0
- docling_serve/engines/async_local/orchestrator.py +101 -0
- docling_serve/engines/async_local/worker.py +116 -0
- docling_serve/engines/base_orchestrator.py +21 -0
- docling_serve/engines/block_local/__init__.py +0 -0
- docling_serve/gradio_ui.py +2 -7
- docling_serve/response_preparation.py +7 -31
- docling_serve/settings.py +5 -0
- pyproject.toml +13 -6
- tests/test_1-file-all-outputs.py +1 -1
- tests/test_1-url-all-outputs.py +1 -1
- tests/test_1-url-async-ws.py +48 -0
- tests/test_1-url-async.py +60 -0
- tests/test_2-files-all-outputs.py +9 -9
- tests/test_2-urls-all-outputs.py +9 -9
- uv.lock +15 -44
.pre-commit-config.yaml
CHANGED
@@ -1,5 +1,14 @@
|
|
1 |
fail_fast: true
|
2 |
repos:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
- repo: local
|
4 |
hooks:
|
5 |
- id: system
|
@@ -13,12 +22,3 @@ repos:
|
|
13 |
rev: 0.6.1
|
14 |
hooks:
|
15 |
- id: uv-lock
|
16 |
-
- repo: https://github.com/astral-sh/ruff-pre-commit
|
17 |
-
rev: v0.9.6
|
18 |
-
hooks:
|
19 |
-
# Run the Ruff linter.
|
20 |
-
- id: ruff
|
21 |
-
args: [--exit-non-zero-on-fix, --config=pyproject.toml]
|
22 |
-
# Run the Ruff formatter.
|
23 |
-
# - id: ruff-format
|
24 |
-
# args: [--config=pyproject.toml]
|
|
|
1 |
fail_fast: true
|
2 |
repos:
|
3 |
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
4 |
+
rev: v0.9.6
|
5 |
+
hooks:
|
6 |
+
# Run the Ruff formatter.
|
7 |
+
- id: ruff-format
|
8 |
+
args: [--config=pyproject.toml]
|
9 |
+
# Run the Ruff linter.
|
10 |
+
- id: ruff
|
11 |
+
args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
|
12 |
- repo: local
|
13 |
hooks:
|
14 |
- id: system
|
|
|
22 |
rev: 0.6.1
|
23 |
hooks:
|
24 |
- id: uv-lock
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Containerfile
CHANGED
@@ -53,7 +53,7 @@ RUN echo "Downloading models..." && \
|
|
53 |
chown -R 1001:0 /opt/app-root/src/.cache && \
|
54 |
chmod -R g=u /opt/app-root/src/.cache
|
55 |
|
56 |
-
COPY --chown=1001:0
|
57 |
RUN --mount=type=cache,target=/opt/app-root/src/.cache/uv,uid=1001 \
|
58 |
uv sync --frozen --no-dev --all-extras ${UV_SYNC_EXTRA_ARGS} # --no-extra ${NO_EXTRA}
|
59 |
|
|
|
53 |
chown -R 1001:0 /opt/app-root/src/.cache && \
|
54 |
chmod -R g=u /opt/app-root/src/.cache
|
55 |
|
56 |
+
COPY --chown=1001:0 ./docling_serve ./docling_serve
|
57 |
RUN --mount=type=cache,target=/opt/app-root/src/.cache/uv,uid=1001 \
|
58 |
uv sync --frozen --no-dev --all-extras ${UV_SYNC_EXTRA_ARGS} # --no-extra ${NO_EXTRA}
|
59 |
|
docling_serve/app.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import importlib.metadata
|
2 |
import logging
|
3 |
import tempfile
|
@@ -6,23 +7,46 @@ from io import BytesIO
|
|
6 |
from pathlib import Path
|
7 |
from typing import Annotated, Any, Dict, List, Optional, Union
|
8 |
|
9 |
-
from
|
10 |
-
|
11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
from fastapi.middleware.cors import CORSMiddleware
|
13 |
from fastapi.responses import RedirectResponse
|
14 |
-
from pydantic import BaseModel
|
15 |
|
16 |
-
from
|
|
|
|
|
|
|
|
|
17 |
ConvertDocumentFileSourcesRequest,
|
18 |
-
ConvertDocumentsOptions,
|
19 |
ConvertDocumentsRequest,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
convert_documents,
|
21 |
converters,
|
22 |
get_pdf_pipeline_opts,
|
23 |
)
|
|
|
|
|
|
|
|
|
|
|
24 |
from docling_serve.helper_functions import FormDepends
|
25 |
-
from docling_serve.response_preparation import
|
26 |
from docling_serve.settings import docling_serve_settings
|
27 |
|
28 |
|
@@ -72,9 +96,22 @@ async def lifespan(app: FastAPI):
|
|
72 |
|
73 |
converters[options_hash].initialize_pipeline(InputFormat.PDF)
|
74 |
|
|
|
|
|
|
|
|
|
|
|
75 |
yield
|
76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
converters.clear()
|
|
|
78 |
# if WITH_UI:
|
79 |
# gradio_ui.close()
|
80 |
|
@@ -84,7 +121,7 @@ async def lifespan(app: FastAPI):
|
|
84 |
##################################
|
85 |
|
86 |
|
87 |
-
def create_app():
|
88 |
try:
|
89 |
version = importlib.metadata.version("docling_serve")
|
90 |
except importlib.metadata.PackageNotFoundError:
|
@@ -145,10 +182,6 @@ def create_app():
|
|
145 |
)
|
146 |
return response
|
147 |
|
148 |
-
# Status
|
149 |
-
class HealthCheckResponse(BaseModel):
|
150 |
-
status: str = "ok"
|
151 |
-
|
152 |
@app.get("/health")
|
153 |
def health() -> HealthCheckResponse:
|
154 |
return HealthCheckResponse()
|
@@ -233,4 +266,129 @@ def create_app():
|
|
233 |
|
234 |
return response
|
235 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
236 |
return app
|
|
|
1 |
+
import asyncio
|
2 |
import importlib.metadata
|
3 |
import logging
|
4 |
import tempfile
|
|
|
7 |
from pathlib import Path
|
8 |
from typing import Annotated, Any, Dict, List, Optional, Union
|
9 |
|
10 |
+
from fastapi import (
|
11 |
+
BackgroundTasks,
|
12 |
+
Depends,
|
13 |
+
FastAPI,
|
14 |
+
HTTPException,
|
15 |
+
Query,
|
16 |
+
UploadFile,
|
17 |
+
WebSocket,
|
18 |
+
WebSocketDisconnect,
|
19 |
+
)
|
20 |
from fastapi.middleware.cors import CORSMiddleware
|
21 |
from fastapi.responses import RedirectResponse
|
|
|
22 |
|
23 |
+
from docling.datamodel.base_models import DocumentStream, InputFormat
|
24 |
+
from docling.document_converter import DocumentConverter
|
25 |
+
|
26 |
+
from docling_serve.datamodel.convert import ConvertDocumentsOptions
|
27 |
+
from docling_serve.datamodel.requests import (
|
28 |
ConvertDocumentFileSourcesRequest,
|
|
|
29 |
ConvertDocumentsRequest,
|
30 |
+
)
|
31 |
+
from docling_serve.datamodel.responses import (
|
32 |
+
ConvertDocumentResponse,
|
33 |
+
HealthCheckResponse,
|
34 |
+
MessageKind,
|
35 |
+
TaskStatusResponse,
|
36 |
+
WebsocketMessage,
|
37 |
+
)
|
38 |
+
from docling_serve.docling_conversion import (
|
39 |
convert_documents,
|
40 |
converters,
|
41 |
get_pdf_pipeline_opts,
|
42 |
)
|
43 |
+
from docling_serve.engines import get_orchestrator
|
44 |
+
from docling_serve.engines.async_local.orchestrator import (
|
45 |
+
AsyncLocalOrchestrator,
|
46 |
+
TaskNotFoundError,
|
47 |
+
)
|
48 |
from docling_serve.helper_functions import FormDepends
|
49 |
+
from docling_serve.response_preparation import process_results
|
50 |
from docling_serve.settings import docling_serve_settings
|
51 |
|
52 |
|
|
|
96 |
|
97 |
converters[options_hash].initialize_pipeline(InputFormat.PDF)
|
98 |
|
99 |
+
orchestrator = get_orchestrator()
|
100 |
+
|
101 |
+
# Start the background queue processor
|
102 |
+
queue_task = asyncio.create_task(orchestrator.process_queue())
|
103 |
+
|
104 |
yield
|
105 |
|
106 |
+
# Cancel the background queue processor on shutdown
|
107 |
+
queue_task.cancel()
|
108 |
+
try:
|
109 |
+
await queue_task
|
110 |
+
except asyncio.CancelledError:
|
111 |
+
_log.info("Queue processor cancelled.")
|
112 |
+
|
113 |
converters.clear()
|
114 |
+
|
115 |
# if WITH_UI:
|
116 |
# gradio_ui.close()
|
117 |
|
|
|
121 |
##################################
|
122 |
|
123 |
|
124 |
+
def create_app(): # noqa: C901
|
125 |
try:
|
126 |
version = importlib.metadata.version("docling_serve")
|
127 |
except importlib.metadata.PackageNotFoundError:
|
|
|
182 |
)
|
183 |
return response
|
184 |
|
|
|
|
|
|
|
|
|
185 |
@app.get("/health")
|
186 |
def health() -> HealthCheckResponse:
|
187 |
return HealthCheckResponse()
|
|
|
266 |
|
267 |
return response
|
268 |
|
269 |
+
# Convert a document from URL(s) using the async api
|
270 |
+
@app.post(
|
271 |
+
"/v1alpha/convert/source/async",
|
272 |
+
response_model=TaskStatusResponse,
|
273 |
+
)
|
274 |
+
async def process_url_async(
|
275 |
+
orchestrator: Annotated[AsyncLocalOrchestrator, Depends(get_orchestrator)],
|
276 |
+
conversion_request: ConvertDocumentsRequest,
|
277 |
+
):
|
278 |
+
task = await orchestrator.enqueue(request=conversion_request)
|
279 |
+
task_queue_position = await orchestrator.get_queue_position(
|
280 |
+
task_id=task.task_id
|
281 |
+
)
|
282 |
+
return TaskStatusResponse(
|
283 |
+
task_id=task.task_id,
|
284 |
+
task_status=task.task_status,
|
285 |
+
task_position=task_queue_position,
|
286 |
+
)
|
287 |
+
|
288 |
+
# Task status poll
|
289 |
+
@app.get(
|
290 |
+
"/v1alpha/status/poll/{task_id}",
|
291 |
+
response_model=TaskStatusResponse,
|
292 |
+
)
|
293 |
+
async def task_status_poll(
|
294 |
+
orchestrator: Annotated[AsyncLocalOrchestrator, Depends(get_orchestrator)],
|
295 |
+
task_id: str,
|
296 |
+
wait: Annotated[
|
297 |
+
float, Query(help="Number of seconds to wait for a completed status.")
|
298 |
+
] = 0.0,
|
299 |
+
):
|
300 |
+
try:
|
301 |
+
task = await orchestrator.task_status(task_id=task_id, wait=wait)
|
302 |
+
task_queue_position = await orchestrator.get_queue_position(task_id=task_id)
|
303 |
+
except TaskNotFoundError:
|
304 |
+
raise HTTPException(status_code=404, detail="Task not found.")
|
305 |
+
return TaskStatusResponse(
|
306 |
+
task_id=task.task_id,
|
307 |
+
task_status=task.task_status,
|
308 |
+
task_position=task_queue_position,
|
309 |
+
)
|
310 |
+
|
311 |
+
# Task status websocket
|
312 |
+
@app.websocket(
|
313 |
+
"/v1alpha/status/ws/{task_id}",
|
314 |
+
)
|
315 |
+
async def task_status_ws(
|
316 |
+
websocket: WebSocket,
|
317 |
+
orchestrator: Annotated[AsyncLocalOrchestrator, Depends(get_orchestrator)],
|
318 |
+
task_id: str,
|
319 |
+
):
|
320 |
+
await websocket.accept()
|
321 |
+
|
322 |
+
if task_id not in orchestrator.tasks:
|
323 |
+
await websocket.send_text(
|
324 |
+
WebsocketMessage(
|
325 |
+
message=MessageKind.ERROR, error="Task not found."
|
326 |
+
).model_dump_json()
|
327 |
+
)
|
328 |
+
await websocket.close()
|
329 |
+
return
|
330 |
+
|
331 |
+
task = orchestrator.tasks[task_id]
|
332 |
+
|
333 |
+
# Track active WebSocket connections for this job
|
334 |
+
orchestrator.task_subscribers[task_id].add(websocket)
|
335 |
+
|
336 |
+
try:
|
337 |
+
task_queue_position = await orchestrator.get_queue_position(task_id=task_id)
|
338 |
+
task_response = TaskStatusResponse(
|
339 |
+
task_id=task.task_id,
|
340 |
+
task_status=task.task_status,
|
341 |
+
task_position=task_queue_position,
|
342 |
+
)
|
343 |
+
await websocket.send_text(
|
344 |
+
WebsocketMessage(
|
345 |
+
message=MessageKind.CONNECTION, task=task_response
|
346 |
+
).model_dump_json()
|
347 |
+
)
|
348 |
+
while True:
|
349 |
+
task_queue_position = await orchestrator.get_queue_position(
|
350 |
+
task_id=task_id
|
351 |
+
)
|
352 |
+
task_response = TaskStatusResponse(
|
353 |
+
task_id=task.task_id,
|
354 |
+
task_status=task.task_status,
|
355 |
+
task_position=task_queue_position,
|
356 |
+
)
|
357 |
+
await websocket.send_text(
|
358 |
+
WebsocketMessage(
|
359 |
+
message=MessageKind.UPDATE, task=task_response
|
360 |
+
).model_dump_json()
|
361 |
+
)
|
362 |
+
# each client message will be interpreted as a request for update
|
363 |
+
msg = await websocket.receive_text()
|
364 |
+
_log.debug(f"Received message: {msg}")
|
365 |
+
|
366 |
+
except WebSocketDisconnect:
|
367 |
+
_log.info(f"WebSocket disconnected for job {task_id}")
|
368 |
+
|
369 |
+
finally:
|
370 |
+
orchestrator.task_subscribers[task_id].remove(websocket)
|
371 |
+
|
372 |
+
# Task result
|
373 |
+
@app.get(
|
374 |
+
"/v1alpha/result/{task_id}",
|
375 |
+
response_model=ConvertDocumentResponse,
|
376 |
+
responses={
|
377 |
+
200: {
|
378 |
+
"content": {"application/zip": {}},
|
379 |
+
}
|
380 |
+
},
|
381 |
+
)
|
382 |
+
async def task_result(
|
383 |
+
orchestrator: Annotated[AsyncLocalOrchestrator, Depends(get_orchestrator)],
|
384 |
+
task_id: str,
|
385 |
+
):
|
386 |
+
result = await orchestrator.task_result(task_id=task_id)
|
387 |
+
if result is None:
|
388 |
+
raise HTTPException(
|
389 |
+
status_code=404,
|
390 |
+
detail="Task result not found. Please wait for a completion status.",
|
391 |
+
)
|
392 |
+
return result
|
393 |
+
|
394 |
return app
|
docling_serve/datamodel/__init__.py
ADDED
File without changes
|
docling_serve/datamodel/convert.py
ADDED
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Define the input options for the API
|
2 |
+
from typing import Annotated, List, Optional
|
3 |
+
|
4 |
+
from pydantic import BaseModel, Field
|
5 |
+
|
6 |
+
from docling.datamodel.base_models import InputFormat, OutputFormat
|
7 |
+
from docling.datamodel.pipeline_options import OcrEngine, PdfBackend, TableFormerMode
|
8 |
+
from docling_core.types.doc import ImageRefMode
|
9 |
+
|
10 |
+
|
11 |
+
class ConvertDocumentsOptions(BaseModel):
|
12 |
+
from_formats: Annotated[
|
13 |
+
List[InputFormat],
|
14 |
+
Field(
|
15 |
+
description=(
|
16 |
+
"Input format(s) to convert from. String or list of strings. "
|
17 |
+
f"Allowed values: {', '.join([v.value for v in InputFormat])}. "
|
18 |
+
"Optional, defaults to all formats."
|
19 |
+
),
|
20 |
+
examples=[[v.value for v in InputFormat]],
|
21 |
+
),
|
22 |
+
] = list(InputFormat)
|
23 |
+
|
24 |
+
to_formats: Annotated[
|
25 |
+
List[OutputFormat],
|
26 |
+
Field(
|
27 |
+
description=(
|
28 |
+
"Output format(s) to convert to. String or list of strings. "
|
29 |
+
f"Allowed values: {', '.join([v.value for v in OutputFormat])}. "
|
30 |
+
"Optional, defaults to Markdown."
|
31 |
+
),
|
32 |
+
examples=[[OutputFormat.MARKDOWN]],
|
33 |
+
),
|
34 |
+
] = [OutputFormat.MARKDOWN]
|
35 |
+
|
36 |
+
image_export_mode: Annotated[
|
37 |
+
ImageRefMode,
|
38 |
+
Field(
|
39 |
+
description=(
|
40 |
+
"Image export mode for the document (in case of JSON,"
|
41 |
+
" Markdown or HTML). "
|
42 |
+
f"Allowed values: {', '.join([v.value for v in ImageRefMode])}. "
|
43 |
+
"Optional, defaults to Embedded."
|
44 |
+
),
|
45 |
+
examples=[ImageRefMode.EMBEDDED.value],
|
46 |
+
# pattern="embedded|placeholder|referenced",
|
47 |
+
),
|
48 |
+
] = ImageRefMode.EMBEDDED
|
49 |
+
|
50 |
+
do_ocr: Annotated[
|
51 |
+
bool,
|
52 |
+
Field(
|
53 |
+
description=(
|
54 |
+
"If enabled, the bitmap content will be processed using OCR. "
|
55 |
+
"Boolean. Optional, defaults to true"
|
56 |
+
),
|
57 |
+
# examples=[True],
|
58 |
+
),
|
59 |
+
] = True
|
60 |
+
|
61 |
+
force_ocr: Annotated[
|
62 |
+
bool,
|
63 |
+
Field(
|
64 |
+
description=(
|
65 |
+
"If enabled, replace existing text with OCR-generated "
|
66 |
+
"text over content. Boolean. Optional, defaults to false."
|
67 |
+
),
|
68 |
+
# examples=[False],
|
69 |
+
),
|
70 |
+
] = False
|
71 |
+
|
72 |
+
# TODO: use a restricted list based on what is installed on the system
|
73 |
+
ocr_engine: Annotated[
|
74 |
+
OcrEngine,
|
75 |
+
Field(
|
76 |
+
description=(
|
77 |
+
"The OCR engine to use. String. "
|
78 |
+
"Allowed values: easyocr, tesseract, rapidocr. "
|
79 |
+
"Optional, defaults to easyocr."
|
80 |
+
),
|
81 |
+
examples=[OcrEngine.EASYOCR],
|
82 |
+
),
|
83 |
+
] = OcrEngine.EASYOCR
|
84 |
+
|
85 |
+
ocr_lang: Annotated[
|
86 |
+
Optional[List[str]],
|
87 |
+
Field(
|
88 |
+
description=(
|
89 |
+
"List of languages used by the OCR engine. "
|
90 |
+
"Note that each OCR engine has "
|
91 |
+
"different values for the language names. String or list of strings. "
|
92 |
+
"Optional, defaults to empty."
|
93 |
+
),
|
94 |
+
examples=[["fr", "de", "es", "en"]],
|
95 |
+
),
|
96 |
+
] = None
|
97 |
+
|
98 |
+
pdf_backend: Annotated[
|
99 |
+
PdfBackend,
|
100 |
+
Field(
|
101 |
+
description=(
|
102 |
+
"The PDF backend to use. String. "
|
103 |
+
f"Allowed values: {', '.join([v.value for v in PdfBackend])}. "
|
104 |
+
f"Optional, defaults to {PdfBackend.DLPARSE_V2.value}."
|
105 |
+
),
|
106 |
+
examples=[PdfBackend.DLPARSE_V2],
|
107 |
+
),
|
108 |
+
] = PdfBackend.DLPARSE_V2
|
109 |
+
|
110 |
+
table_mode: Annotated[
|
111 |
+
TableFormerMode,
|
112 |
+
Field(
|
113 |
+
TableFormerMode.FAST,
|
114 |
+
description=(
|
115 |
+
"Mode to use for table structure, String. "
|
116 |
+
f"Allowed values: {', '.join([v.value for v in TableFormerMode])}. "
|
117 |
+
"Optional, defaults to fast."
|
118 |
+
),
|
119 |
+
examples=[TableFormerMode.FAST],
|
120 |
+
# pattern="fast|accurate",
|
121 |
+
),
|
122 |
+
] = TableFormerMode.FAST
|
123 |
+
|
124 |
+
abort_on_error: Annotated[
|
125 |
+
bool,
|
126 |
+
Field(
|
127 |
+
description=(
|
128 |
+
"Abort on error if enabled. Boolean. Optional, defaults to false."
|
129 |
+
),
|
130 |
+
# examples=[False],
|
131 |
+
),
|
132 |
+
] = False
|
133 |
+
|
134 |
+
return_as_file: Annotated[
|
135 |
+
bool,
|
136 |
+
Field(
|
137 |
+
description=(
|
138 |
+
"Return the output as a zip file "
|
139 |
+
"(will happen anyway if multiple files are generated). "
|
140 |
+
"Boolean. Optional, defaults to false."
|
141 |
+
),
|
142 |
+
examples=[False],
|
143 |
+
),
|
144 |
+
] = False
|
145 |
+
|
146 |
+
do_table_structure: Annotated[
|
147 |
+
bool,
|
148 |
+
Field(
|
149 |
+
description=(
|
150 |
+
"If enabled, the table structure will be extracted. "
|
151 |
+
"Boolean. Optional, defaults to true."
|
152 |
+
),
|
153 |
+
examples=[True],
|
154 |
+
),
|
155 |
+
] = True
|
156 |
+
|
157 |
+
include_images: Annotated[
|
158 |
+
bool,
|
159 |
+
Field(
|
160 |
+
description=(
|
161 |
+
"If enabled, images will be extracted from the document. "
|
162 |
+
"Boolean. Optional, defaults to true."
|
163 |
+
),
|
164 |
+
examples=[True],
|
165 |
+
),
|
166 |
+
] = True
|
167 |
+
|
168 |
+
images_scale: Annotated[
|
169 |
+
float,
|
170 |
+
Field(
|
171 |
+
description="Scale factor for images. Float. Optional, defaults to 2.0.",
|
172 |
+
examples=[2.0],
|
173 |
+
),
|
174 |
+
] = 2.0
|
docling_serve/datamodel/engines.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import enum
|
2 |
+
from typing import Optional
|
3 |
+
|
4 |
+
from pydantic import BaseModel
|
5 |
+
|
6 |
+
from docling_serve.datamodel.requests import ConvertDocumentsRequest
|
7 |
+
from docling_serve.datamodel.responses import ConvertDocumentResponse
|
8 |
+
|
9 |
+
|
10 |
+
class TaskStatus(str, enum.Enum):
|
11 |
+
SUCCESS = "success"
|
12 |
+
PENDING = "pending"
|
13 |
+
STARTED = "started"
|
14 |
+
FAILURE = "failure"
|
15 |
+
|
16 |
+
|
17 |
+
class AsyncEngine(str, enum.Enum):
|
18 |
+
LOCAL = "local"
|
19 |
+
|
20 |
+
|
21 |
+
class Task(BaseModel):
|
22 |
+
task_id: str
|
23 |
+
task_status: TaskStatus = TaskStatus.PENDING
|
24 |
+
request: Optional[ConvertDocumentsRequest]
|
25 |
+
result: Optional[ConvertDocumentResponse] = None
|
26 |
+
|
27 |
+
def is_completed(self) -> bool:
|
28 |
+
if self.task_status in [TaskStatus.SUCCESS, TaskStatus.FAILURE]:
|
29 |
+
return True
|
30 |
+
return False
|
docling_serve/datamodel/requests.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
from io import BytesIO
|
3 |
+
from typing import Annotated, Any, Dict, List, Union
|
4 |
+
|
5 |
+
from pydantic import BaseModel, Field
|
6 |
+
|
7 |
+
from docling.datamodel.base_models import DocumentStream
|
8 |
+
|
9 |
+
from docling_serve.datamodel.convert import ConvertDocumentsOptions
|
10 |
+
|
11 |
+
|
12 |
+
class DocumentsConvertBase(BaseModel):
|
13 |
+
options: ConvertDocumentsOptions = ConvertDocumentsOptions()
|
14 |
+
|
15 |
+
|
16 |
+
class HttpSource(BaseModel):
|
17 |
+
url: Annotated[
|
18 |
+
str,
|
19 |
+
Field(
|
20 |
+
description="HTTP url to process",
|
21 |
+
examples=["https://arxiv.org/pdf/2206.01062"],
|
22 |
+
),
|
23 |
+
]
|
24 |
+
headers: Annotated[
|
25 |
+
Dict[str, Any],
|
26 |
+
Field(
|
27 |
+
description="Additional headers used to fetch the urls, "
|
28 |
+
"e.g. authorization, agent, etc"
|
29 |
+
),
|
30 |
+
] = {}
|
31 |
+
|
32 |
+
|
33 |
+
class FileSource(BaseModel):
|
34 |
+
base64_string: Annotated[
|
35 |
+
str,
|
36 |
+
Field(
|
37 |
+
description="Content of the file serialized in base64. "
|
38 |
+
"For example it can be obtained via "
|
39 |
+
"`base64 -w 0 /path/to/file/pdf-to-convert.pdf`."
|
40 |
+
),
|
41 |
+
]
|
42 |
+
filename: Annotated[
|
43 |
+
str,
|
44 |
+
Field(description="Filename of the uploaded document", examples=["file.pdf"]),
|
45 |
+
]
|
46 |
+
|
47 |
+
def to_document_stream(self) -> DocumentStream:
|
48 |
+
buf = BytesIO(base64.b64decode(self.base64_string))
|
49 |
+
return DocumentStream(stream=buf, name=self.filename)
|
50 |
+
|
51 |
+
|
52 |
+
class ConvertDocumentHttpSourcesRequest(DocumentsConvertBase):
|
53 |
+
http_sources: List[HttpSource]
|
54 |
+
|
55 |
+
|
56 |
+
class ConvertDocumentFileSourcesRequest(DocumentsConvertBase):
|
57 |
+
file_sources: List[FileSource]
|
58 |
+
|
59 |
+
|
60 |
+
ConvertDocumentsRequest = Union[
|
61 |
+
ConvertDocumentFileSourcesRequest, ConvertDocumentHttpSourcesRequest
|
62 |
+
]
|
docling_serve/datamodel/responses.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import enum
|
2 |
+
from typing import Dict, List, Optional
|
3 |
+
|
4 |
+
from pydantic import BaseModel
|
5 |
+
|
6 |
+
from docling.datamodel.document import ConversionStatus, ErrorItem
|
7 |
+
from docling.utils.profiling import ProfilingItem
|
8 |
+
from docling_core.types.doc import DoclingDocument
|
9 |
+
|
10 |
+
|
11 |
+
# Status
|
12 |
+
class HealthCheckResponse(BaseModel):
|
13 |
+
status: str = "ok"
|
14 |
+
|
15 |
+
|
16 |
+
class DocumentResponse(BaseModel):
|
17 |
+
filename: str
|
18 |
+
md_content: Optional[str] = None
|
19 |
+
json_content: Optional[DoclingDocument] = None
|
20 |
+
html_content: Optional[str] = None
|
21 |
+
text_content: Optional[str] = None
|
22 |
+
doctags_content: Optional[str] = None
|
23 |
+
|
24 |
+
|
25 |
+
class ConvertDocumentResponse(BaseModel):
|
26 |
+
document: DocumentResponse
|
27 |
+
status: ConversionStatus
|
28 |
+
errors: List[ErrorItem] = []
|
29 |
+
processing_time: float
|
30 |
+
timings: Dict[str, ProfilingItem] = {}
|
31 |
+
|
32 |
+
|
33 |
+
class ConvertDocumentErrorResponse(BaseModel):
|
34 |
+
status: ConversionStatus
|
35 |
+
|
36 |
+
|
37 |
+
class TaskStatusResponse(BaseModel):
|
38 |
+
task_id: str
|
39 |
+
task_status: str
|
40 |
+
task_position: Optional[int] = None
|
41 |
+
|
42 |
+
|
43 |
+
class MessageKind(str, enum.Enum):
|
44 |
+
CONNECTION = "connection"
|
45 |
+
UPDATE = "update"
|
46 |
+
ERROR = "error"
|
47 |
+
|
48 |
+
|
49 |
+
class WebsocketMessage(BaseModel):
|
50 |
+
message: MessageKind
|
51 |
+
task: Optional[TaskStatusResponse] = None
|
52 |
+
error: Optional[str] = None
|
docling_serve/docling_conversion.py
CHANGED
@@ -1,27 +1,16 @@
|
|
1 |
-
import base64
|
2 |
import hashlib
|
3 |
import json
|
4 |
import logging
|
5 |
-
from io import BytesIO
|
6 |
from pathlib import Path
|
7 |
-
from typing import
|
8 |
-
|
9 |
-
|
10 |
-
Dict,
|
11 |
-
Iterable,
|
12 |
-
Iterator,
|
13 |
-
List,
|
14 |
-
Optional,
|
15 |
-
Tuple,
|
16 |
-
Type,
|
17 |
-
Union,
|
18 |
-
)
|
19 |
|
20 |
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
21 |
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
22 |
from docling.backend.pdf_backend import PdfDocumentBackend
|
23 |
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
24 |
-
from docling.datamodel.base_models import DocumentStream, InputFormat
|
25 |
from docling.datamodel.document import ConversionResult
|
26 |
from docling.datamodel.pipeline_options import (
|
27 |
EasyOcrOptions,
|
@@ -35,235 +24,14 @@ from docling.datamodel.pipeline_options import (
|
|
35 |
)
|
36 |
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
37 |
from docling_core.types.doc import ImageRefMode
|
38 |
-
from fastapi import HTTPException
|
39 |
-
from pydantic import BaseModel, Field
|
40 |
|
|
|
41 |
from docling_serve.helper_functions import _to_list_of_strings
|
42 |
from docling_serve.settings import docling_serve_settings
|
43 |
|
44 |
_log = logging.getLogger(__name__)
|
45 |
|
46 |
|
47 |
-
# Define the input options for the API
|
48 |
-
class ConvertDocumentsOptions(BaseModel):
|
49 |
-
from_formats: Annotated[
|
50 |
-
List[InputFormat],
|
51 |
-
Field(
|
52 |
-
description=(
|
53 |
-
"Input format(s) to convert from. String or list of strings. "
|
54 |
-
f"Allowed values: {', '.join([v.value for v in InputFormat])}. "
|
55 |
-
"Optional, defaults to all formats."
|
56 |
-
),
|
57 |
-
examples=[[v.value for v in InputFormat]],
|
58 |
-
),
|
59 |
-
] = list(InputFormat)
|
60 |
-
|
61 |
-
to_formats: Annotated[
|
62 |
-
List[OutputFormat],
|
63 |
-
Field(
|
64 |
-
description=(
|
65 |
-
"Output format(s) to convert to. String or list of strings. "
|
66 |
-
f"Allowed values: {', '.join([v.value for v in OutputFormat])}. "
|
67 |
-
"Optional, defaults to Markdown."
|
68 |
-
),
|
69 |
-
examples=[[OutputFormat.MARKDOWN]],
|
70 |
-
),
|
71 |
-
] = [OutputFormat.MARKDOWN]
|
72 |
-
|
73 |
-
image_export_mode: Annotated[
|
74 |
-
ImageRefMode,
|
75 |
-
Field(
|
76 |
-
description=(
|
77 |
-
"Image export mode for the document (in case of JSON,"
|
78 |
-
" Markdown or HTML). "
|
79 |
-
f"Allowed values: {', '.join([v.value for v in ImageRefMode])}. "
|
80 |
-
"Optional, defaults to Embedded."
|
81 |
-
),
|
82 |
-
examples=[ImageRefMode.EMBEDDED.value],
|
83 |
-
# pattern="embedded|placeholder|referenced",
|
84 |
-
),
|
85 |
-
] = ImageRefMode.EMBEDDED
|
86 |
-
|
87 |
-
do_ocr: Annotated[
|
88 |
-
bool,
|
89 |
-
Field(
|
90 |
-
description=(
|
91 |
-
"If enabled, the bitmap content will be processed using OCR. "
|
92 |
-
"Boolean. Optional, defaults to true"
|
93 |
-
),
|
94 |
-
# examples=[True],
|
95 |
-
),
|
96 |
-
] = True
|
97 |
-
|
98 |
-
force_ocr: Annotated[
|
99 |
-
bool,
|
100 |
-
Field(
|
101 |
-
description=(
|
102 |
-
"If enabled, replace existing text with OCR-generated "
|
103 |
-
"text over content. Boolean. Optional, defaults to false."
|
104 |
-
),
|
105 |
-
# examples=[False],
|
106 |
-
),
|
107 |
-
] = False
|
108 |
-
|
109 |
-
# TODO: use a restricted list based on what is installed on the system
|
110 |
-
ocr_engine: Annotated[
|
111 |
-
OcrEngine,
|
112 |
-
Field(
|
113 |
-
description=(
|
114 |
-
"The OCR engine to use. String. "
|
115 |
-
"Allowed values: easyocr, tesseract, rapidocr. "
|
116 |
-
"Optional, defaults to easyocr."
|
117 |
-
),
|
118 |
-
examples=[OcrEngine.EASYOCR],
|
119 |
-
),
|
120 |
-
] = OcrEngine.EASYOCR
|
121 |
-
|
122 |
-
ocr_lang: Annotated[
|
123 |
-
Optional[List[str]],
|
124 |
-
Field(
|
125 |
-
description=(
|
126 |
-
"List of languages used by the OCR engine. "
|
127 |
-
"Note that each OCR engine has "
|
128 |
-
"different values for the language names. String or list of strings. "
|
129 |
-
"Optional, defaults to empty."
|
130 |
-
),
|
131 |
-
examples=[["fr", "de", "es", "en"]],
|
132 |
-
),
|
133 |
-
] = None
|
134 |
-
|
135 |
-
pdf_backend: Annotated[
|
136 |
-
PdfBackend,
|
137 |
-
Field(
|
138 |
-
description=(
|
139 |
-
"The PDF backend to use. String. "
|
140 |
-
f"Allowed values: {', '.join([v.value for v in PdfBackend])}. "
|
141 |
-
f"Optional, defaults to {PdfBackend.DLPARSE_V2.value}."
|
142 |
-
),
|
143 |
-
examples=[PdfBackend.DLPARSE_V2],
|
144 |
-
),
|
145 |
-
] = PdfBackend.DLPARSE_V2
|
146 |
-
|
147 |
-
table_mode: Annotated[
|
148 |
-
TableFormerMode,
|
149 |
-
Field(
|
150 |
-
TableFormerMode.FAST,
|
151 |
-
description=(
|
152 |
-
"Mode to use for table structure, String. "
|
153 |
-
f"Allowed values: {', '.join([v.value for v in TableFormerMode])}. "
|
154 |
-
"Optional, defaults to fast."
|
155 |
-
),
|
156 |
-
examples=[TableFormerMode.FAST],
|
157 |
-
# pattern="fast|accurate",
|
158 |
-
),
|
159 |
-
] = TableFormerMode.FAST
|
160 |
-
|
161 |
-
abort_on_error: Annotated[
|
162 |
-
bool,
|
163 |
-
Field(
|
164 |
-
description=(
|
165 |
-
"Abort on error if enabled. Boolean. Optional, defaults to false."
|
166 |
-
),
|
167 |
-
# examples=[False],
|
168 |
-
),
|
169 |
-
] = False
|
170 |
-
|
171 |
-
return_as_file: Annotated[
|
172 |
-
bool,
|
173 |
-
Field(
|
174 |
-
description=(
|
175 |
-
"Return the output as a zip file "
|
176 |
-
"(will happen anyway if multiple files are generated). "
|
177 |
-
"Boolean. Optional, defaults to false."
|
178 |
-
),
|
179 |
-
examples=[False],
|
180 |
-
),
|
181 |
-
] = False
|
182 |
-
|
183 |
-
do_table_structure: Annotated[
|
184 |
-
bool,
|
185 |
-
Field(
|
186 |
-
description=(
|
187 |
-
"If enabled, the table structure will be extracted. "
|
188 |
-
"Boolean. Optional, defaults to true."
|
189 |
-
),
|
190 |
-
examples=[True],
|
191 |
-
),
|
192 |
-
] = True
|
193 |
-
|
194 |
-
include_images: Annotated[
|
195 |
-
bool,
|
196 |
-
Field(
|
197 |
-
description=(
|
198 |
-
"If enabled, images will be extracted from the document. "
|
199 |
-
"Boolean. Optional, defaults to true."
|
200 |
-
),
|
201 |
-
examples=[True],
|
202 |
-
),
|
203 |
-
] = True
|
204 |
-
|
205 |
-
images_scale: Annotated[
|
206 |
-
float,
|
207 |
-
Field(
|
208 |
-
description="Scale factor for images. Float. Optional, defaults to 2.0.",
|
209 |
-
examples=[2.0],
|
210 |
-
),
|
211 |
-
] = 2.0
|
212 |
-
|
213 |
-
|
214 |
-
class DocumentsConvertBase(BaseModel):
|
215 |
-
options: ConvertDocumentsOptions = ConvertDocumentsOptions()
|
216 |
-
|
217 |
-
|
218 |
-
class HttpSource(BaseModel):
|
219 |
-
url: Annotated[
|
220 |
-
str,
|
221 |
-
Field(
|
222 |
-
description="HTTP url to process",
|
223 |
-
examples=["https://arxiv.org/pdf/2206.01062"],
|
224 |
-
),
|
225 |
-
]
|
226 |
-
headers: Annotated[
|
227 |
-
Dict[str, Any],
|
228 |
-
Field(
|
229 |
-
description="Additional headers used to fetch the urls, "
|
230 |
-
"e.g. authorization, agent, etc"
|
231 |
-
),
|
232 |
-
] = {}
|
233 |
-
|
234 |
-
|
235 |
-
class FileSource(BaseModel):
|
236 |
-
base64_string: Annotated[
|
237 |
-
str,
|
238 |
-
Field(
|
239 |
-
description="Content of the file serialized in base64. "
|
240 |
-
"For example it can be obtained via "
|
241 |
-
"`base64 -w 0 /path/to/file/pdf-to-convert.pdf`."
|
242 |
-
),
|
243 |
-
]
|
244 |
-
filename: Annotated[
|
245 |
-
str,
|
246 |
-
Field(description="Filename of the uploaded document", examples=["file.pdf"]),
|
247 |
-
]
|
248 |
-
|
249 |
-
def to_document_stream(self) -> DocumentStream:
|
250 |
-
buf = BytesIO(base64.b64decode(self.base64_string))
|
251 |
-
return DocumentStream(stream=buf, name=self.filename)
|
252 |
-
|
253 |
-
|
254 |
-
class ConvertDocumentHttpSourcesRequest(DocumentsConvertBase):
|
255 |
-
http_sources: List[HttpSource]
|
256 |
-
|
257 |
-
|
258 |
-
class ConvertDocumentFileSourcesRequest(DocumentsConvertBase):
|
259 |
-
file_sources: List[FileSource]
|
260 |
-
|
261 |
-
|
262 |
-
ConvertDocumentsRequest = Union[
|
263 |
-
ConvertDocumentFileSourcesRequest, ConvertDocumentHttpSourcesRequest
|
264 |
-
]
|
265 |
-
|
266 |
-
|
267 |
# Document converters will be preloaded and stored in a dictionary
|
268 |
converters: Dict[bytes, DocumentConverter] = {}
|
269 |
|
|
|
|
|
1 |
import hashlib
|
2 |
import json
|
3 |
import logging
|
|
|
4 |
from pathlib import Path
|
5 |
+
from typing import Any, Dict, Iterable, Iterator, Optional, Tuple, Type, Union
|
6 |
+
|
7 |
+
from fastapi import HTTPException
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
10 |
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
11 |
from docling.backend.pdf_backend import PdfDocumentBackend
|
12 |
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
13 |
+
from docling.datamodel.base_models import DocumentStream, InputFormat
|
14 |
from docling.datamodel.document import ConversionResult
|
15 |
from docling.datamodel.pipeline_options import (
|
16 |
EasyOcrOptions,
|
|
|
24 |
)
|
25 |
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
26 |
from docling_core.types.doc import ImageRefMode
|
|
|
|
|
27 |
|
28 |
+
from docling_serve.datamodel.convert import ConvertDocumentsOptions
|
29 |
from docling_serve.helper_functions import _to_list_of_strings
|
30 |
from docling_serve.settings import docling_serve_settings
|
31 |
|
32 |
_log = logging.getLogger(__name__)
|
33 |
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
# Document converters will be preloaded and stored in a dictionary
|
36 |
converters: Dict[bytes, DocumentConverter] = {}
|
37 |
|
docling_serve/engines/__init__.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from functools import lru_cache
|
2 |
+
|
3 |
+
from docling_serve.engines.async_local.orchestrator import AsyncLocalOrchestrator
|
4 |
+
|
5 |
+
|
6 |
+
@lru_cache
def get_orchestrator() -> AsyncLocalOrchestrator:
    """Return the shared orchestrator, building it on the first call.

    The zero-argument call is memoized by ``lru_cache``, so repeated calls
    hand back the same ``AsyncLocalOrchestrator`` instance.
    """
    orchestrator = AsyncLocalOrchestrator()
    return orchestrator
|
docling_serve/engines/async_local/__init__.py
ADDED
File without changes
|
docling_serve/engines/async_local/orchestrator.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
import logging
|
3 |
+
import uuid
|
4 |
+
from typing import Dict, List, Optional, Set
|
5 |
+
|
6 |
+
from fastapi import WebSocket
|
7 |
+
|
8 |
+
from docling_serve.datamodel.engines import Task, TaskStatus
|
9 |
+
from docling_serve.datamodel.requests import ConvertDocumentsRequest
|
10 |
+
from docling_serve.datamodel.responses import (
|
11 |
+
MessageKind,
|
12 |
+
TaskStatusResponse,
|
13 |
+
WebsocketMessage,
|
14 |
+
)
|
15 |
+
from docling_serve.engines.async_local.worker import AsyncLocalWorker
|
16 |
+
from docling_serve.engines.base_orchestrator import BaseOrchestrator
|
17 |
+
from docling_serve.settings import docling_serve_settings
|
18 |
+
|
19 |
+
_log = logging.getLogger(__name__)
|
20 |
+
|
21 |
+
|
22 |
+
class OrchestratorError(Exception):
    """Base class for the errors defined by this orchestrator module."""


class TaskNotFoundError(OrchestratorError):
    """Raised when a task id is not present in the orchestrator's tasks."""
|
28 |
+
|
29 |
+
|
30 |
+
class AsyncLocalOrchestrator(BaseOrchestrator):
    """In-process orchestrator built around an ``asyncio.Queue``.

    Tasks are kept in memory, their ids are queued for the workers started by
    :meth:`process_queue`, and websockets registered in ``task_subscribers``
    receive status updates.
    """

    def __init__(self):
        # Ids of tasks waiting to be picked up by a worker.
        self.task_queue = asyncio.Queue()
        # Tasks keyed by their id.
        self.tasks: Dict[str, Task] = {}
        # Ordered list of waiting ids, used to report queue positions.
        self.queue_list: List[str] = []
        # Websockets to notify, per task id.
        self.task_subscribers: Dict[str, Set[WebSocket]] = {}

    async def enqueue(self, request: ConvertDocumentsRequest) -> Task:
        """Create a task for ``request``, queue its id and return the task."""
        task_id = str(uuid.uuid4())
        task = Task(task_id=task_id, request=request)
        self.tasks[task_id] = task
        self.queue_list.append(task_id)
        self.task_subscribers[task_id] = set()
        await self.task_queue.put(task_id)
        return task

    async def queue_size(self) -> int:
        """Return the number of ids currently in the queue."""
        return self.task_queue.qsize()

    async def get_queue_position(self, task_id: str) -> Optional[int]:
        """Return the 1-based position of ``task_id`` in the waiting list.

        Returns ``None`` when the id is not waiting (anymore).
        """
        try:
            return self.queue_list.index(task_id) + 1
        except ValueError:
            return None

    async def task_status(self, task_id: str, wait: float = 0.0) -> Task:
        """Return the task registered under ``task_id``.

        ``wait`` is accepted but not used by this implementation.

        Raises:
            TaskNotFoundError: if ``task_id`` is unknown.
        """
        if task_id not in self.tasks:
            raise TaskNotFoundError()
        return self.tasks[task_id]

    async def task_result(self, task_id: str):
        """Return the ``result`` attribute of the task ``task_id``.

        Raises:
            TaskNotFoundError: if ``task_id`` is unknown.
        """
        if task_id not in self.tasks:
            raise TaskNotFoundError()
        return self.tasks[task_id].result

    async def process_queue(self):
        """Start the configured number of workers and wait on them."""
        # Create a pool of workers
        workers = []
        for i in range(docling_serve_settings.eng_loc_num_workers):
            _log.debug(f"Starting worker {i}")
            w = AsyncLocalWorker(i, self)
            worker_task = asyncio.create_task(w.loop())
            workers.append(worker_task)

        # Wait for all workers to complete (they won't, as they run indefinitely)
        await asyncio.gather(*workers)
        _log.debug("All workers completed.")

    async def notify_task_subscribers(self, task_id: str):
        """Send the current status of ``task_id`` to all of its subscribers.

        The websocket is closed after the message when the task is completed.

        Raises:
            RuntimeError: if no subscribers set exists for ``task_id``.
        """
        if task_id not in self.task_subscribers:
            raise RuntimeError(f"Task {task_id} does not have a subscribers list.")

        task = self.tasks[task_id]
        task_queue_position = await self.get_queue_position(task_id)
        msg = TaskStatusResponse(
            task_id=task.task_id,
            task_status=task.task_status,
            task_position=task_queue_position,
        )
        # The payload is identical for every subscriber: serialize it once.
        payload = WebsocketMessage(
            message=MessageKind.UPDATE, task=msg
        ).model_dump_json()

        # Iterate over a snapshot: each await yields to the event loop, and the
        # subscriber set must not change size while it is being iterated.
        for websocket in list(self.task_subscribers[task_id]):
            await websocket.send_text(payload)
            if task.is_completed():
                await websocket.close()

    async def notify_queue_positions(self):
        """Notify the subscribers of every task that is still pending."""
        # Snapshot the ids: ``enqueue`` may add entries while we are awaiting,
        # which would otherwise raise "dictionary changed size during iteration".
        for task_id in list(self.task_subscribers):
            # notify only pending tasks
            if self.tasks[task_id].task_status != TaskStatus.PENDING:
                continue

            await self.notify_task_subscribers(task_id)
|
docling_serve/engines/async_local/worker.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
import logging
|
3 |
+
import time
|
4 |
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
|
5 |
+
|
6 |
+
from fastapi import BackgroundTasks
|
7 |
+
|
8 |
+
from docling.datamodel.base_models import DocumentStream
|
9 |
+
|
10 |
+
from docling_serve.datamodel.engines import TaskStatus
|
11 |
+
from docling_serve.datamodel.requests import ConvertDocumentFileSourcesRequest
|
12 |
+
from docling_serve.datamodel.responses import ConvertDocumentResponse
|
13 |
+
from docling_serve.docling_conversion import convert_documents
|
14 |
+
from docling_serve.response_preparation import process_results
|
15 |
+
|
16 |
+
if TYPE_CHECKING:
|
17 |
+
from docling_serve.engines.async_local.orchestrator import AsyncLocalOrchestrator
|
18 |
+
|
19 |
+
_log = logging.getLogger(__name__)
|
20 |
+
|
21 |
+
|
22 |
+
class AsyncLocalWorker:
    """Worker that consumes task ids from the orchestrator queue."""

    def __init__(self, worker_id: int, orchestrator: "AsyncLocalOrchestrator"):
        self.worker_id = worker_id
        self.orchestrator = orchestrator

    @staticmethod
    def _run_conversion(task):
        """Convert the sources of ``task.request`` and build the response.

        This is synchronous and is meant to be executed in a worker thread.
        """
        sources: List[Union[str, DocumentStream]] = []
        headers: Optional[Dict[str, Any]] = None
        if isinstance(task.request, ConvertDocumentFileSourcesRequest):
            for file_source in task.request.file_sources:
                sources.append(file_source.to_document_stream())
        else:
            for http_source in task.request.http_sources:
                sources.append(http_source.url)
                # Only the first non-empty headers mapping is used.
                if headers is None and http_source.headers:
                    headers = http_source.headers

        # Note: results are only an iterator->lazy evaluation
        results = convert_documents(
            sources=sources,
            options=task.request.options,
            headers=headers,
        )

        # The real processing will happen here
        return process_results(
            background_tasks=BackgroundTasks(),
            conversion_options=task.request.options,
            conv_results=results,
        )

    async def loop(self):
        """Fetch task ids from the queue forever and process them one by one.

        For each task the status is set to ``STARTED``, subscribers are
        notified, the conversion runs in a thread, and the status ends as
        ``SUCCESS`` or ``FAILURE``. Subscribers are notified again at the end.

        Raises:
            RuntimeError: if a queued id has no entry in ``orchestrator.tasks``.
        """
        _log.debug(f"Starting loop for worker {self.worker_id}")
        while True:
            task_id: str = await self.orchestrator.task_queue.get()
            self.orchestrator.queue_list.remove(task_id)

            if task_id not in self.orchestrator.tasks:
                raise RuntimeError(f"Task {task_id} not found.")
            task = self.orchestrator.tasks[task_id]

            try:
                task.task_status = TaskStatus.STARTED
                _log.info(f"Worker {self.worker_id} processing task {task_id}")

                # Notify clients about task updates
                await self.orchestrator.notify_task_subscribers(task_id)

                # Notify clients about queue updates
                await self.orchestrator.notify_queue_positions()

                # Run the conversion in a thread to avoid blocking the event loop.
                # TODO: send partial updates, e.g. when a document in the batch is done
                start_time = time.monotonic()
                response = await asyncio.to_thread(self._run_conversion, task)
                processing_time = time.monotonic() - start_time

                if not isinstance(response, ConvertDocumentResponse):
                    # Logged only: the result is still stored below.
                    _log.error(
                        f"Worker {self.worker_id} got un-processable "
                        f"result for {task_id}: {type(response)}"
                    )
                task.result = response
                # The request is not needed once the result is stored.
                task.request = None

                task.task_status = TaskStatus.SUCCESS
                _log.info(
                    f"Worker {self.worker_id} completed job {task_id} "
                    f"in {processing_time:.2f} seconds"
                )

            except Exception as e:
                _log.error(
                    f"Worker {self.worker_id} failed to process job {task_id}: {e}"
                )
                task.task_status = TaskStatus.FAILURE

            finally:
                try:
                    await self.orchestrator.notify_task_subscribers(task_id)
                finally:
                    # Always balance the ``get()`` above, even when the final
                    # notification fails, so ``Queue.join()`` cannot hang.
                    self.orchestrator.task_queue.task_done()
                _log.debug(f"Worker {self.worker_id} completely done with {task_id}")
|
docling_serve/engines/base_orchestrator.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import ABC, abstractmethod
|
2 |
+
|
3 |
+
from docling_serve.datamodel.engines import Task
|
4 |
+
|
5 |
+
|
6 |
+
class BaseOrchestrator(ABC):
    """Abstract interface shared by the orchestrator engines."""

    @abstractmethod
    async def enqueue(self, task) -> Task:
        """Submit work to the orchestrator and return the resulting ``Task``."""

    @abstractmethod
    async def queue_size(self) -> int:
        """Return the size of the queue as an integer."""

    @abstractmethod
    async def task_status(self, task_id: str) -> Task:
        """Return the ``Task`` identified by ``task_id``."""

    @abstractmethod
    async def task_result(self, task_id: str):
        """Return the result associated with ``task_id``."""
|
docling_serve/engines/block_local/__init__.py
ADDED
File without changes
|
docling_serve/gradio_ui.py
CHANGED
@@ -333,7 +333,6 @@ with gr.Blocks(
|
|
333 |
title="Docling Serve",
|
334 |
delete_cache=(3600, 3600), # Delete all files older than 1 hour every hour
|
335 |
) as ui:
|
336 |
-
|
337 |
# Constants stored in states to be able to pass them as inputs to functions
|
338 |
processing_text = gr.State("Processing your document(s), please wait...")
|
339 |
true_bool = gr.State(True)
|
@@ -593,9 +592,7 @@ with gr.Blocks(
|
|
593 |
set_outputs_visibility_direct,
|
594 |
inputs=[false_bool, false_bool],
|
595 |
outputs=[content_output, file_output],
|
596 |
-
).then(
|
597 |
-
clear_url_input, inputs=None, outputs=[url_input]
|
598 |
-
)
|
599 |
|
600 |
# File processing
|
601 |
file_process_btn.click(
|
@@ -664,6 +661,4 @@ with gr.Blocks(
|
|
664 |
set_outputs_visibility_direct,
|
665 |
inputs=[false_bool, false_bool],
|
666 |
outputs=[content_output, file_output],
|
667 |
-
).then(
|
668 |
-
clear_file_input, inputs=None, outputs=[file_input]
|
669 |
-
)
|
|
|
333 |
title="Docling Serve",
|
334 |
delete_cache=(3600, 3600), # Delete all files older than 1 hour every hour
|
335 |
) as ui:
|
|
|
336 |
# Constants stored in states to be able to pass them as inputs to functions
|
337 |
processing_text = gr.State("Processing your document(s), please wait...")
|
338 |
true_bool = gr.State(True)
|
|
|
592 |
set_outputs_visibility_direct,
|
593 |
inputs=[false_bool, false_bool],
|
594 |
outputs=[content_output, file_output],
|
595 |
+
).then(clear_url_input, inputs=None, outputs=[url_input])
|
|
|
|
|
596 |
|
597 |
# File processing
|
598 |
file_process_btn.click(
|
|
|
661 |
set_outputs_visibility_direct,
|
662 |
inputs=[false_bool, false_bool],
|
663 |
outputs=[content_output, file_output],
|
664 |
+
).then(clear_file_input, inputs=None, outputs=[file_input])
|
|
|
|
docling_serve/response_preparation.py
CHANGED
@@ -4,40 +4,19 @@ import shutil
|
|
4 |
import tempfile
|
5 |
import time
|
6 |
from pathlib import Path
|
7 |
-
from typing import
|
8 |
|
9 |
-
from docling.datamodel.base_models import OutputFormat
|
10 |
-
from docling.datamodel.document import ConversionResult, ConversionStatus, ErrorItem
|
11 |
-
from docling.utils.profiling import ProfilingItem
|
12 |
-
from docling_core.types.doc import DoclingDocument, ImageRefMode
|
13 |
from fastapi import BackgroundTasks, HTTPException
|
14 |
from fastapi.responses import FileResponse
|
15 |
-
from pydantic import BaseModel
|
16 |
-
|
17 |
-
from docling_serve.docling_conversion import ConvertDocumentsOptions
|
18 |
-
|
19 |
-
_log = logging.getLogger(__name__)
|
20 |
-
|
21 |
-
|
22 |
-
class DocumentResponse(BaseModel):
|
23 |
-
filename: str
|
24 |
-
md_content: Optional[str] = None
|
25 |
-
json_content: Optional[DoclingDocument] = None
|
26 |
-
html_content: Optional[str] = None
|
27 |
-
text_content: Optional[str] = None
|
28 |
-
doctags_content: Optional[str] = None
|
29 |
|
|
|
|
|
|
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
status: ConversionStatus
|
34 |
-
errors: List[ErrorItem] = []
|
35 |
-
processing_time: float
|
36 |
-
timings: Dict[str, ProfilingItem] = {}
|
37 |
-
|
38 |
|
39 |
-
|
40 |
-
status: ConversionStatus
|
41 |
|
42 |
|
43 |
def _export_document_as_content(
|
@@ -49,7 +28,6 @@ def _export_document_as_content(
|
|
49 |
export_doctags: bool,
|
50 |
image_mode: ImageRefMode,
|
51 |
):
|
52 |
-
|
53 |
document = DocumentResponse(filename=conv_res.input.file.name)
|
54 |
|
55 |
if conv_res.status == ConversionStatus.SUCCESS:
|
@@ -86,7 +64,6 @@ def _export_documents_as_files(
|
|
86 |
export_doctags: bool,
|
87 |
image_export_mode: ImageRefMode,
|
88 |
):
|
89 |
-
|
90 |
success_count = 0
|
91 |
failure_count = 0
|
92 |
|
@@ -150,7 +127,6 @@ def process_results(
|
|
150 |
conversion_options: ConvertDocumentsOptions,
|
151 |
conv_results: Iterable[ConversionResult],
|
152 |
) -> Union[ConvertDocumentResponse, FileResponse]:
|
153 |
-
|
154 |
# Let's start by processing the documents
|
155 |
try:
|
156 |
start_time = time.monotonic()
|
|
|
4 |
import tempfile
|
5 |
import time
|
6 |
from pathlib import Path
|
7 |
+
from typing import Iterable, Union
|
8 |
|
|
|
|
|
|
|
|
|
9 |
from fastapi import BackgroundTasks, HTTPException
|
10 |
from fastapi.responses import FileResponse
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
+
from docling.datamodel.base_models import OutputFormat
|
13 |
+
from docling.datamodel.document import ConversionResult, ConversionStatus
|
14 |
+
from docling_core.types.doc import ImageRefMode
|
15 |
|
16 |
+
from docling_serve.datamodel.convert import ConvertDocumentsOptions
|
17 |
+
from docling_serve.datamodel.responses import ConvertDocumentResponse, DocumentResponse
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
+
_log = logging.getLogger(__name__)
|
|
|
20 |
|
21 |
|
22 |
def _export_document_as_content(
|
|
|
28 |
export_doctags: bool,
|
29 |
image_mode: ImageRefMode,
|
30 |
):
|
|
|
31 |
document = DocumentResponse(filename=conv_res.input.file.name)
|
32 |
|
33 |
if conv_res.status == ConversionStatus.SUCCESS:
|
|
|
64 |
export_doctags: bool,
|
65 |
image_export_mode: ImageRefMode,
|
66 |
):
|
|
|
67 |
success_count = 0
|
68 |
failure_count = 0
|
69 |
|
|
|
127 |
conversion_options: ConvertDocumentsOptions,
|
128 |
conv_results: Iterable[ConversionResult],
|
129 |
) -> Union[ConvertDocumentResponse, FileResponse]:
|
|
|
130 |
# Let's start by processing the documents
|
131 |
try:
|
132 |
start_time = time.monotonic()
|
docling_serve/settings.py
CHANGED
@@ -3,6 +3,8 @@ from typing import Optional, Union
|
|
3 |
|
4 |
from pydantic_settings import BaseSettings, SettingsConfigDict
|
5 |
|
|
|
|
|
6 |
|
7 |
class UvicornSettings(BaseSettings):
|
8 |
model_config = SettingsConfigDict(
|
@@ -28,6 +30,9 @@ class DoclingServeSettings(BaseSettings):
|
|
28 |
enable_ui: bool = False
|
29 |
artifacts_path: Optional[Path] = None
|
30 |
|
|
|
|
|
|
|
31 |
|
32 |
uvicorn_settings = UvicornSettings()
|
33 |
docling_serve_settings = DoclingServeSettings()
|
|
|
3 |
|
4 |
from pydantic_settings import BaseSettings, SettingsConfigDict
|
5 |
|
6 |
+
from docling_serve.datamodel.engines import AsyncEngine
|
7 |
+
|
8 |
|
9 |
class UvicornSettings(BaseSettings):
|
10 |
model_config = SettingsConfigDict(
|
|
|
30 |
enable_ui: bool = False
|
31 |
artifacts_path: Optional[Path] = None
|
32 |
|
33 |
+
eng_kind: AsyncEngine = AsyncEngine.LOCAL
|
34 |
+
eng_loc_num_workers: int = 2
|
35 |
+
|
36 |
|
37 |
uvicorn_settings = UvicornSettings()
|
38 |
docling_serve_settings = DoclingServeSettings()
|
pyproject.toml
CHANGED
@@ -30,7 +30,7 @@ classifiers = [
|
|
30 |
]
|
31 |
requires-python = ">=3.10"
|
32 |
dependencies = [
|
33 |
-
"docling~=2.
|
34 |
"fastapi[standard]~=0.115",
|
35 |
"httpx~=0.28",
|
36 |
"pydantic~=2.10",
|
@@ -38,6 +38,7 @@ dependencies = [
|
|
38 |
"python-multipart>=0.0.14,<0.1.0",
|
39 |
"typer~=0.12",
|
40 |
"uvicorn[standard]>=0.29.0,<1.0.0",
|
|
|
41 |
]
|
42 |
|
43 |
[project.optional-dependencies]
|
@@ -164,9 +165,19 @@ ignore = [
|
|
164 |
[tool.ruff.lint.mccabe]
|
165 |
max-complexity = 15
|
166 |
|
|
|
|
|
|
|
167 |
[tool.ruff.lint.isort]
|
168 |
combine-as-imports = true
|
169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
|
171 |
[tool.mypy]
|
172 |
pretty = true
|
@@ -180,10 +191,6 @@ module = [
|
|
180 |
"easyocr.*",
|
181 |
"tesserocr.*",
|
182 |
"rapidocr_onnxruntime.*",
|
183 |
-
"docling_conversion.*",
|
184 |
-
"gradio_ui.*",
|
185 |
-
"response_preparation.*",
|
186 |
-
"helper_functions.*",
|
187 |
"requests.*",
|
188 |
]
|
189 |
ignore_missing_imports = true
|
|
|
30 |
]
|
31 |
requires-python = ">=3.10"
|
32 |
dependencies = [
|
33 |
+
"docling~=2.25.1",
|
34 |
"fastapi[standard]~=0.115",
|
35 |
"httpx~=0.28",
|
36 |
"pydantic~=2.10",
|
|
|
38 |
"python-multipart>=0.0.14,<0.1.0",
|
39 |
"typer~=0.12",
|
40 |
"uvicorn[standard]>=0.29.0,<1.0.0",
|
41 |
+
"websockets~=14.0",
|
42 |
]
|
43 |
|
44 |
[project.optional-dependencies]
|
|
|
165 |
[tool.ruff.lint.mccabe]
|
166 |
max-complexity = 15
|
167 |
|
168 |
+
[tool.ruff.lint.isort.sections]
|
169 |
+
"docling" = ["docling", "docling_core"]
|
170 |
+
|
171 |
[tool.ruff.lint.isort]
|
172 |
combine-as-imports = true
|
173 |
+
section-order = [
|
174 |
+
"future",
|
175 |
+
"standard-library",
|
176 |
+
"third-party",
|
177 |
+
"docling",
|
178 |
+
"first-party",
|
179 |
+
"local-folder",
|
180 |
+
]
|
181 |
|
182 |
[tool.mypy]
|
183 |
pretty = true
|
|
|
191 |
"easyocr.*",
|
192 |
"tesserocr.*",
|
193 |
"rapidocr_onnxruntime.*",
|
|
|
|
|
|
|
|
|
194 |
"requests.*",
|
195 |
]
|
196 |
ignore_missing_imports = true
|
tests/test_1-file-all-outputs.py
CHANGED
@@ -89,7 +89,7 @@ async def test_convert_file(async_client):
|
|
89 |
check.is_in(
|
90 |
'{"schema_name": "DoclingDocument"',
|
91 |
json.dumps(data["document"]["json_content"]),
|
92 |
-
msg=f
|
93 |
)
|
94 |
# HTML check
|
95 |
check.is_in(
|
|
|
89 |
check.is_in(
|
90 |
'{"schema_name": "DoclingDocument"',
|
91 |
json.dumps(data["document"]["json_content"]),
|
92 |
+
msg=f'JSON document should contain \'{{\\n "schema_name": "DoclingDocument\'". Received: {safe_slice(data["document"]["json_content"])}',
|
93 |
)
|
94 |
# HTML check
|
95 |
check.is_in(
|
tests/test_1-url-all-outputs.py
CHANGED
@@ -83,7 +83,7 @@ async def test_convert_url(async_client):
|
|
83 |
check.is_in(
|
84 |
'{"schema_name": "DoclingDocument"',
|
85 |
json.dumps(data["document"]["json_content"]),
|
86 |
-
msg=f
|
87 |
)
|
88 |
# HTML check
|
89 |
check.is_in(
|
|
|
83 |
check.is_in(
|
84 |
'{"schema_name": "DoclingDocument"',
|
85 |
json.dumps(data["document"]["json_content"]),
|
86 |
+
msg=f'JSON document should contain \'{{\\n "schema_name": "DoclingDocument\'". Received: {safe_slice(data["document"]["json_content"])}',
|
87 |
)
|
88 |
# HTML check
|
89 |
check.is_in(
|
tests/test_1-url-async-ws.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
import httpx
|
5 |
+
import pytest
|
6 |
+
import pytest_asyncio
|
7 |
+
from websockets.sync.client import connect
|
8 |
+
|
9 |
+
|
10 |
+
@pytest_asyncio.fixture
async def async_client():
    """Yield an ``httpx.AsyncClient`` (60 s timeout), closed on teardown."""
    http_client = httpx.AsyncClient(timeout=60.0)
    async with http_client as client:
        yield client
|
14 |
+
|
15 |
+
|
16 |
+
@pytest.mark.asyncio
async def test_convert_url(async_client: httpx.AsyncClient):
    """Submit a file source to the async endpoint and follow it over websocket.

    The same payload is submitted five times; for each task the status
    messages received on the websocket are printed until the server closes it.
    """
    doc_filename = Path("tests/2408.09869v5.pdf")
    encoded_doc = base64.b64encode(doc_filename.read_bytes()).decode()

    base_url = "http://localhost:5001/v1alpha"
    options = {
        "to_formats": ["md", "json"],
        "image_export_mode": "placeholder",
        "ocr": True,
        "abort_on_error": False,
        "return_as_file": False,
    }
    payload = {
        "options": options,
        "file_sources": [{"base64_string": encoded_doc, "filename": doc_filename.name}],
    }

    for _ in range(5):
        response = await async_client.post(
            f"{base_url}/convert/source/async", json=payload
        )
        assert response.status_code == 200, "Response should be 200 OK"

        task = response.json()

        uri = f"ws://localhost:5001/v1alpha/status/ws/{task['task_id']}"
        with connect(uri) as websocket:
            for message in websocket:
                print(message)
|
tests/test_1-url-async.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import random
|
3 |
+
import time
|
4 |
+
|
5 |
+
import httpx
|
6 |
+
import pytest
|
7 |
+
import pytest_asyncio
|
8 |
+
|
9 |
+
|
10 |
+
@pytest_asyncio.fixture
async def async_client():
    """Yield an ``httpx.AsyncClient`` (60 s timeout), closed on teardown."""
    http_client = httpx.AsyncClient(timeout=60.0)
    async with http_client as client:
        yield client
|
14 |
+
|
15 |
+
|
16 |
+
@pytest.mark.asyncio
async def test_convert_url(async_client):
    """Submit a URL to the async endpoint and poll until the task finishes.

    One URL is picked at random and submitted five times; each task is polled
    every two seconds until its status is ``success`` or ``failure``.
    """
    # Local import: needed for the non-blocking sleep in the polling loop.
    import asyncio

    example_docs = [
        "https://arxiv.org/pdf/2411.19710",
        "https://arxiv.org/pdf/2501.17887",
        "https://www.nature.com/articles/s41467-024-50779-y.pdf",
        "https://arxiv.org/pdf/2306.12802",
        "https://arxiv.org/pdf/2311.18481",
    ]

    base_url = "http://localhost:5001/v1alpha"
    payload = {
        "options": {
            "to_formats": ["md", "json"],
            "image_export_mode": "placeholder",
            "ocr": True,
            "abort_on_error": False,
            "return_as_file": False,
        },
        "http_sources": [{"url": random.choice(example_docs)}],
    }
    print(json.dumps(payload, indent=2))

    for _ in range(5):
        response = await async_client.post(
            f"{base_url}/convert/source/async", json=payload
        )
        assert response.status_code == 200, "Response should be 200 OK"

        task = response.json()

        print(json.dumps(task, indent=2))

        while task["task_status"] not in ("success", "failure"):
            response = await async_client.get(f"{base_url}/status/poll/{task['task_id']}")
            assert response.status_code == 200, "Response should be 200 OK"
            task = response.json()
            print(f"{task['task_status']=}")
            print(f"{task['task_position']=}")

            # Yield to the event loop instead of blocking it with time.sleep.
            await asyncio.sleep(2)

        assert task["task_status"] == "success"
|
tests/test_2-files-all-outputs.py
CHANGED
@@ -57,18 +57,18 @@ async def test_convert_file(async_client):
|
|
57 |
content_disposition = response.headers.get("content-disposition")
|
58 |
|
59 |
with check:
|
60 |
-
assert (
|
61 |
-
|
62 |
-
)
|
63 |
with check:
|
64 |
assert "attachment" in content_disposition, "Response should be an attachment"
|
65 |
with check:
|
66 |
-
assert (
|
67 |
-
'
|
68 |
-
)
|
69 |
|
70 |
content_type = response.headers.get("content-type")
|
71 |
with check:
|
72 |
-
assert (
|
73 |
-
|
74 |
-
)
|
|
|
57 |
content_disposition = response.headers.get("content-disposition")
|
58 |
|
59 |
with check:
|
60 |
+
assert content_disposition is not None, (
|
61 |
+
"Content-Disposition header should be present"
|
62 |
+
)
|
63 |
with check:
|
64 |
assert "attachment" in content_disposition, "Response should be an attachment"
|
65 |
with check:
|
66 |
+
assert 'filename="converted_docs.zip"' in content_disposition, (
|
67 |
+
"Attachment filename should be 'converted_docs.zip'"
|
68 |
+
)
|
69 |
|
70 |
content_type = response.headers.get("content-type")
|
71 |
with check:
|
72 |
+
assert content_type == "application/zip", (
|
73 |
+
"Content-Type should be 'application/zip'"
|
74 |
+
)
|
tests/test_2-urls-all-outputs.py
CHANGED
@@ -50,18 +50,18 @@ async def test_convert_url(async_client):
|
|
50 |
content_disposition = response.headers.get("content-disposition")
|
51 |
|
52 |
with check:
|
53 |
-
assert (
|
54 |
-
|
55 |
-
)
|
56 |
with check:
|
57 |
assert "attachment" in content_disposition, "Response should be an attachment"
|
58 |
with check:
|
59 |
-
assert (
|
60 |
-
'
|
61 |
-
)
|
62 |
|
63 |
content_type = response.headers.get("content-type")
|
64 |
with check:
|
65 |
-
assert (
|
66 |
-
|
67 |
-
)
|
|
|
50 |
content_disposition = response.headers.get("content-disposition")
|
51 |
|
52 |
with check:
|
53 |
+
assert content_disposition is not None, (
|
54 |
+
"Content-Disposition header should be present"
|
55 |
+
)
|
56 |
with check:
|
57 |
assert "attachment" in content_disposition, "Response should be an attachment"
|
58 |
with check:
|
59 |
+
assert 'filename="converted_docs.zip"' in content_disposition, (
|
60 |
+
"Attachment filename should be 'converted_docs.zip'"
|
61 |
+
)
|
62 |
|
63 |
content_type = response.headers.get("content-type")
|
64 |
with check:
|
65 |
+
assert content_type == "application/zip", (
|
66 |
+
"Content-Type should be 'application/zip'"
|
67 |
+
)
|
uv.lock
CHANGED
@@ -349,38 +349,6 @@ wheels = [
|
|
349 |
{ url = "https://files.pythonhosted.org/packages/2e/38/3fd83c4690dc7d753a442a284b3826ea5e5c380a411443c66421cd823898/cryptography-44.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d9c5b9f698a83c8bd71e0f4d3f9f839ef244798e5ffe96febfa9714717db7af7", size = 3134657 },
|
350 |
]
|
351 |
|
352 |
-
[[package]]
|
353 |
-
name = "deepsearch-glm"
|
354 |
-
version = "1.0.0"
|
355 |
-
source = { registry = "https://pypi.org/simple" }
|
356 |
-
dependencies = [
|
357 |
-
{ name = "pywin32", marker = "sys_platform == 'win32' or (extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124')" },
|
358 |
-
]
|
359 |
-
sdist = { url = "https://files.pythonhosted.org/packages/73/d5/a907234e57f5c4f6480c9ddbc3cdacc47f727c768e502be3d361719fac4e/deepsearch_glm-1.0.0.tar.gz", hash = "sha256:e8dce88ac519a693c260f28bd3c4ec409811e65ade84fb508f6c6e37ca065e62", size = 2401014 }
|
360 |
-
wheels = [
|
361 |
-
{ url = "https://files.pythonhosted.org/packages/40/65/4b2013784d5ed8d3664a2efa61f15600c8bf090766b0363c036d78aca550/deepsearch_glm-1.0.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:94792b57df7a1c4ba8b47ebd8f36ea0a090d4f27a4fba39bd7b166b6b537260a", size = 6303790 },
|
362 |
-
{ url = "https://files.pythonhosted.org/packages/45/2a/1e95260a712948a21b74dcb239032d9e612f7e1a273657008655749f4115/deepsearch_glm-1.0.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ff46e352e96a2f56ce7ae4fdf04b271ee841c29ff159b1dec0e5ecaaadba8d4d", size = 5945851 },
|
363 |
-
{ url = "https://files.pythonhosted.org/packages/9e/1a/5c37a98f27644fd02bc447df651e8d5ce484cd6ce7cb178218625b4de5bc/deepsearch_glm-1.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d77d3d94d49641888aa15f3ad23e81158e791aa9d9608dd8168dc71788e56f3", size = 7431282 },
|
364 |
-
{ url = "https://files.pythonhosted.org/packages/e8/e2/56b5e7ae3ccc4d8ee758427c8c9a403c985e250a468c53538c269897bef2/deepsearch_glm-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:143de0fd111a570be12935d8799a2715fe1775d4dc4e256337860b429cee5d36", size = 7759571 },
|
365 |
-
{ url = "https://files.pythonhosted.org/packages/61/f4/e39a5090a2bf0d641449918865566ad5adabef156993a922bdbf4a3ebb60/deepsearch_glm-1.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:9f2872dd573cd2206ce7f9e2e6016c38b66d9ecbd983283ff5e8c6023813c311", size = 7904646 },
|
366 |
-
{ url = "https://files.pythonhosted.org/packages/41/f7/8e8dd9738554f97522b59b0a6d7680ccf2d527bd3471ec4aa4e52acf552a/deepsearch_glm-1.0.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:e64d94ff5209f0a11e8c75c6b28b033ef27b95a22c2fbcbd945e7fe8cc421545", size = 6309301 },
|
367 |
-
{ url = "https://files.pythonhosted.org/packages/17/37/4d8514d8ef851e44513a71f675a7ebb373f109aece38e324c7d444ced20c/deepsearch_glm-1.0.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:a5702205677b768b51f881d15d933370f6ef3c826dfac3b9aa0b904d2e6c495a", size = 5951522 },
|
368 |
-
{ url = "https://files.pythonhosted.org/packages/0c/c6/3680318e66df278fa7f0811dc862d6cb3c328ce168b4f36736eb77120b6d/deepsearch_glm-1.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0417a2ae998e1709f03458cfb9adb55423bb1328224eb055300796baa757879f", size = 7434315 },
|
369 |
-
{ url = "https://files.pythonhosted.org/packages/c3/cd/9ffb616d347d568f868f47585b3261c16e277aa7b37740e8720eee71c539/deepsearch_glm-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f0e1efe9af0d28e9b473fe599246deb3a0be7c3d546a478da284747144d086a", size = 7761264 },
|
370 |
-
{ url = "https://files.pythonhosted.org/packages/3d/d3/e5ebdda9cee8a1c846e6a960a0e5b97624aff2f248c2bc89ae490b9a1342/deepsearch_glm-1.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:807faf13eb0deea55a1951d479a85d5e20de0ff8b2e0b57b2f7939552759a426", size = 7908603 },
|
371 |
-
{ url = "https://files.pythonhosted.org/packages/60/ca/6adbadc979910b11594cd0242f1991942c22528eead431d47de064ac2860/deepsearch_glm-1.0.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:56d9575df9eceb8c2ae33e3d15e133924cc195714c3d268599b6f8414c1f6bb8", size = 6308715 },
|
372 |
-
{ url = "https://files.pythonhosted.org/packages/20/7c/bf1e9c458705c7143c6630cb6847554ad694d25dc6f1f038512b9c86160a/deepsearch_glm-1.0.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:51f5c6522f60ba73eb12eeb7217bd98d871ba7c078337a4059d05878d8baf2d6", size = 5949609 },
|
373 |
-
{ url = "https://files.pythonhosted.org/packages/21/b1/eb0cd0db50d05f2d7a510a77960e85e6caee727eb3d931ed0ec067917813/deepsearch_glm-1.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6211eaf497ad7cfcb68f80f9b5387940be0204fe149a9fc03988a95145f410a", size = 7433929 },
|
374 |
-
{ url = "https://files.pythonhosted.org/packages/3a/7e/2b7db77ff02fe9eec41f3605fcd72e3eb4e6b48561b344d432b417a75cfe/deepsearch_glm-1.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b003bf457fce61ea4de79e2d7d0228a1ae349f677eb6570e745f79d4429804f", size = 7760438 },
|
375 |
-
{ url = "https://files.pythonhosted.org/packages/ab/97/ffb2bb5d2432c7b0e9f3a3e6b5873fbcd6e19e82b620393bfb8e01bdecb1/deepsearch_glm-1.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:9d61f66048e6ab60fe9f84c823fd593bf8517755833bd9efb59156d77a2b42d0", size = 7907583 },
|
376 |
-
{ url = "https://files.pythonhosted.org/packages/38/06/08c5fd0e1144c2c8d76d06da1545a9cf589278a37f8b9e6235b5b416eb52/deepsearch_glm-1.0.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:7d558e8b365c27ee665d0589165fd074fb252c73715f9cc6aeb4304a63683f37", size = 6308867 },
|
377 |
-
{ url = "https://files.pythonhosted.org/packages/ba/fb/f5f9787876b67ce83d5afa4903901be9f8071530bc0706dc2228afc0b6c0/deepsearch_glm-1.0.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:3199093a9472e5756214b9b6563f827c19c001c7dd8ae00e03eed1140c12930d", size = 5949719 },
|
378 |
-
{ url = "https://files.pythonhosted.org/packages/83/0f/42b5a4aa798acbc6309d748435b006c489e58102b6cb2278e7b8f0194743/deepsearch_glm-1.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f18d1ee68a0479592e0c714e6cbf9e2d0fa8edd692d580da64431c84cbef5c2", size = 7434981 },
|
379 |
-
{ url = "https://files.pythonhosted.org/packages/17/6a/c2c4eaa4470b78dde6c03f055cbb09f3f7f15b8a6ff38f5bea5180339e6f/deepsearch_glm-1.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:62c1c0ea0a544219da15c017632f9e0be116ecdc335b865c6c5760429557fe23", size = 7760773 },
|
380 |
-
{ url = "https://files.pythonhosted.org/packages/01/0a/7c3cf75bad38a8d6ff3842b78b3263dd81ad4eaf1d859f4b8e1ab465cad5/deepsearch_glm-1.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:962f393dcec2204de1a5cb0f635c65258bde2424ad2d4e0f5df770139c3958de", size = 7908766 },
|
381 |
-
{ url = "https://files.pythonhosted.org/packages/1f/cd/e6507d924aa69e9647f917ed671e2d62e19e41d4f120a15fcbb583661667/deepsearch_glm-1.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:e2315cc4ffe7032dada294a0cd72a47dbc6c0121fd07d4b5719f9a9e9519d091", size = 14644989 },
|
382 |
-
]
|
383 |
-
|
384 |
[[package]]
|
385 |
name = "dill"
|
386 |
version = "0.3.9"
|
@@ -410,12 +378,11 @@ wheels = [
|
|
410 |
|
411 |
[[package]]
|
412 |
name = "docling"
|
413 |
-
version = "2.
|
414 |
source = { registry = "https://pypi.org/simple" }
|
415 |
dependencies = [
|
416 |
{ name = "beautifulsoup4" },
|
417 |
{ name = "certifi" },
|
418 |
-
{ name = "deepsearch-glm" },
|
419 |
{ name = "docling-core", extra = ["chunking"] },
|
420 |
{ name = "docling-ibm-models" },
|
421 |
{ name = "docling-parse" },
|
@@ -438,9 +405,9 @@ dependencies = [
|
|
438 |
{ name = "tqdm" },
|
439 |
{ name = "typer" },
|
440 |
]
|
441 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
442 |
wheels = [
|
443 |
-
{ url = "https://files.pythonhosted.org/packages/
|
444 |
]
|
445 |
|
446 |
[[package]]
|
@@ -472,14 +439,16 @@ chunking = [
|
|
472 |
|
473 |
[[package]]
|
474 |
name = "docling-ibm-models"
|
475 |
-
version = "3.
|
476 |
source = { registry = "https://pypi.org/simple" }
|
477 |
dependencies = [
|
|
|
478 |
{ name = "huggingface-hub" },
|
479 |
{ name = "jsonlines" },
|
480 |
{ name = "numpy" },
|
481 |
{ name = "opencv-python-headless" },
|
482 |
{ name = "pillow" },
|
|
|
483 |
{ name = "safetensors", extra = ["torch"] },
|
484 |
{ name = "torch", version = "2.6.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine != 'x86_64' and sys_platform == 'darwin' and extra == 'extra-13-docling-serve-cpu') or (platform_machine == 'x86_64' and extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124') or (sys_platform != 'darwin' and extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124')" },
|
485 |
{ name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "(extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124') or (extra != 'extra-13-docling-serve-cpu' and extra != 'extra-13-docling-serve-cu124')" },
|
@@ -492,9 +461,9 @@ dependencies = [
|
|
492 |
{ name = "tqdm" },
|
493 |
{ name = "transformers" },
|
494 |
]
|
495 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
496 |
wheels = [
|
497 |
-
{ url = "https://files.pythonhosted.org/packages/
|
498 |
]
|
499 |
|
500 |
[[package]]
|
@@ -546,14 +515,15 @@ dependencies = [
|
|
546 |
{ name = "python-multipart" },
|
547 |
{ name = "typer" },
|
548 |
{ name = "uvicorn", extra = ["standard"] },
|
|
|
549 |
]
|
550 |
|
551 |
[package.optional-dependencies]
|
552 |
cpu = [
|
553 |
-
{ name = "torch", version = "2.6.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "
|
554 |
-
{ name = "torch", version = "2.6.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "
|
555 |
-
{ name = "torchvision", version = "0.21.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine != 'x86_64' and sys_platform == 'darwin'
|
556 |
-
{ name = "torchvision", version = "0.21.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine == 'x86_64' and sys_platform == 'darwin'
|
557 |
]
|
558 |
cu124 = [
|
559 |
{ name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" } },
|
@@ -583,7 +553,7 @@ dev = [
|
|
583 |
|
584 |
[package.metadata]
|
585 |
requires-dist = [
|
586 |
-
{ name = "docling", specifier = "~=2.
|
587 |
{ name = "fastapi", extras = ["standard"], specifier = "~=0.115" },
|
588 |
{ name = "gradio", marker = "extra == 'ui'", specifier = "~=5.9" },
|
589 |
{ name = "httpx", specifier = "~=0.28" },
|
@@ -599,6 +569,7 @@ requires-dist = [
|
|
599 |
{ name = "torchvision", marker = "extra == 'cu124'", specifier = ">=0.21.0", index = "https://download.pytorch.org/whl/cu124", conflict = { package = "docling-serve", extra = "cu124" } },
|
600 |
{ name = "typer", specifier = "~=0.12" },
|
601 |
{ name = "uvicorn", extras = ["standard"], specifier = ">=0.29.0,<1.0.0" },
|
|
|
602 |
]
|
603 |
provides-extras = ["ui", "tesserocr", "rapidocr", "cpu", "cu124"]
|
604 |
|
|
|
349 |
{ url = "https://files.pythonhosted.org/packages/2e/38/3fd83c4690dc7d753a442a284b3826ea5e5c380a411443c66421cd823898/cryptography-44.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d9c5b9f698a83c8bd71e0f4d3f9f839ef244798e5ffe96febfa9714717db7af7", size = 3134657 },
|
350 |
]
|
351 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
352 |
[[package]]
|
353 |
name = "dill"
|
354 |
version = "0.3.9"
|
|
|
378 |
|
379 |
[[package]]
|
380 |
name = "docling"
|
381 |
+
version = "2.25.1"
|
382 |
source = { registry = "https://pypi.org/simple" }
|
383 |
dependencies = [
|
384 |
{ name = "beautifulsoup4" },
|
385 |
{ name = "certifi" },
|
|
|
386 |
{ name = "docling-core", extra = ["chunking"] },
|
387 |
{ name = "docling-ibm-models" },
|
388 |
{ name = "docling-parse" },
|
|
|
405 |
{ name = "tqdm" },
|
406 |
{ name = "typer" },
|
407 |
]
|
408 |
+
sdist = { url = "https://files.pythonhosted.org/packages/f9/88/b6d782d2cd7ed602d2bae1a01e87a6347a37295ad450d86159cc7c252290/docling-2.25.1.tar.gz", hash = "sha256:ba2fce77659f4ccf1c8a696531ea9f17253215dbebfac6536012bbc6d1c29ce8", size = 112676 }
|
409 |
wheels = [
|
410 |
+
{ url = "https://files.pythonhosted.org/packages/2a/c1/6c58516672f0f60c432ae331391b6548e4fdcb7b6a6dcd7725605284dcf7/docling-2.25.1-py3-none-any.whl", hash = "sha256:92318591342fc50781134fc553c6c57b703ce43e8095a80d59ed02206d0f560c", size = 145677 },
|
411 |
]
|
412 |
|
413 |
[[package]]
|
|
|
439 |
|
440 |
[[package]]
|
441 |
name = "docling-ibm-models"
|
442 |
+
version = "3.4.1"
|
443 |
source = { registry = "https://pypi.org/simple" }
|
444 |
dependencies = [
|
445 |
+
{ name = "docling-core" },
|
446 |
{ name = "huggingface-hub" },
|
447 |
{ name = "jsonlines" },
|
448 |
{ name = "numpy" },
|
449 |
{ name = "opencv-python-headless" },
|
450 |
{ name = "pillow" },
|
451 |
+
{ name = "pydantic" },
|
452 |
{ name = "safetensors", extra = ["torch"] },
|
453 |
{ name = "torch", version = "2.6.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine != 'x86_64' and sys_platform == 'darwin' and extra == 'extra-13-docling-serve-cpu') or (platform_machine == 'x86_64' and extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124') or (sys_platform != 'darwin' and extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124')" },
|
454 |
{ name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "(extra == 'extra-13-docling-serve-cpu' and extra == 'extra-13-docling-serve-cu124') or (extra != 'extra-13-docling-serve-cpu' and extra != 'extra-13-docling-serve-cu124')" },
|
|
|
461 |
{ name = "tqdm" },
|
462 |
{ name = "transformers" },
|
463 |
]
|
464 |
+
sdist = { url = "https://files.pythonhosted.org/packages/eb/a5/88d5b7c970d5e10a06062fe9e9de3cde6acdefcc1f85854f689a82863c2a/docling_ibm_models-3.4.1.tar.gz", hash = "sha256:093b4dff2ea284a4953c3aa009e29945208b8d389b94fb14940a03a93f673e96", size = 69794 }
|
465 |
wheels = [
|
466 |
+
{ url = "https://files.pythonhosted.org/packages/af/8f/0f2b823fa09d06deacbdfc6d5d7809d462ddc508f43146960083d113c4c6/docling_ibm_models-3.4.1-py3-none-any.whl", hash = "sha256:c3582c99dddfa3f0eafcf80cf1267fd8efa39c4a74cc7a88f9dd49684fac2986", size = 80886 },
|
467 |
]
|
468 |
|
469 |
[[package]]
|
|
|
515 |
{ name = "python-multipart" },
|
516 |
{ name = "typer" },
|
517 |
{ name = "uvicorn", extra = ["standard"] },
|
518 |
+
{ name = "websockets" },
|
519 |
]
|
520 |
|
521 |
[package.optional-dependencies]
|
522 |
cpu = [
|
523 |
+
{ name = "torch", version = "2.6.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "platform_machine != 'x86_64' and sys_platform == 'darwin'" },
|
524 |
+
{ name = "torch", version = "2.6.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "platform_machine == 'x86_64' or sys_platform != 'darwin'" },
|
525 |
+
{ name = "torchvision", version = "0.21.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine != 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux')" },
|
526 |
+
{ name = "torchvision", version = "0.21.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
|
527 |
]
|
528 |
cu124 = [
|
529 |
{ name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" } },
|
|
|
553 |
|
554 |
[package.metadata]
|
555 |
requires-dist = [
|
556 |
+
{ name = "docling", specifier = "~=2.25.1" },
|
557 |
{ name = "fastapi", extras = ["standard"], specifier = "~=0.115" },
|
558 |
{ name = "gradio", marker = "extra == 'ui'", specifier = "~=5.9" },
|
559 |
{ name = "httpx", specifier = "~=0.28" },
|
|
|
569 |
{ name = "torchvision", marker = "extra == 'cu124'", specifier = ">=0.21.0", index = "https://download.pytorch.org/whl/cu124", conflict = { package = "docling-serve", extra = "cu124" } },
|
570 |
{ name = "typer", specifier = "~=0.12" },
|
571 |
{ name = "uvicorn", extras = ["standard"], specifier = ">=0.29.0,<1.0.0" },
|
572 |
+
{ name = "websockets", specifier = "~=14.0" },
|
573 |
]
|
574 |
provides-extras = ["ui", "tesserocr", "rapidocr", "cpu", "cu124"]
|
575 |
|