Spaces:
Configuration error
Configuration error
Michele Dolfi
committed on
chore: Remove deprecated type aliases and run as pre-commit (#79)
Browse files
- docling_serve/app.py +4 -4
- docling_serve/datamodel/convert.py +4 -4
- docling_serve/datamodel/requests.py +4 -4
- docling_serve/datamodel/responses.py +3 -3
- docling_serve/docling_conversion.py +7 -6
- docling_serve/engines/async_local/orchestrator.py +4 -4
- docling_serve/engines/async_local/worker.py +3 -3
- docling_serve/helper_functions.py +4 -4
- docling_serve/response_preparation.py +2 -1
- pyproject.toml +3 -1
docling_serve/app.py
CHANGED
@@ -5,7 +5,7 @@ import tempfile
|
|
5 |
from contextlib import asynccontextmanager
|
6 |
from io import BytesIO
|
7 |
from pathlib import Path
|
8 |
-
from typing import Annotated, Any,
|
9 |
|
10 |
from fastapi import (
|
11 |
BackgroundTasks,
|
@@ -205,8 +205,8 @@ def create_app(): # noqa: C901
|
|
205 |
def process_url(
|
206 |
background_tasks: BackgroundTasks, conversion_request: ConvertDocumentsRequest
|
207 |
):
|
208 |
-
sources:
|
209 |
-
headers: Optional[
|
210 |
if isinstance(conversion_request, ConvertDocumentFileSourcesRequest):
|
211 |
for file_source in conversion_request.file_sources:
|
212 |
sources.append(file_source.to_document_stream())
|
@@ -242,7 +242,7 @@ def create_app(): # noqa: C901
|
|
242 |
)
|
243 |
async def process_file(
|
244 |
background_tasks: BackgroundTasks,
|
245 |
-
files:
|
246 |
options: Annotated[
|
247 |
ConvertDocumentsOptions, FormDepends(ConvertDocumentsOptions)
|
248 |
],
|
|
|
5 |
from contextlib import asynccontextmanager
|
6 |
from io import BytesIO
|
7 |
from pathlib import Path
|
8 |
+
from typing import Annotated, Any, Optional, Union
|
9 |
|
10 |
from fastapi import (
|
11 |
BackgroundTasks,
|
|
|
205 |
def process_url(
|
206 |
background_tasks: BackgroundTasks, conversion_request: ConvertDocumentsRequest
|
207 |
):
|
208 |
+
sources: list[Union[str, DocumentStream]] = []
|
209 |
+
headers: Optional[dict[str, Any]] = None
|
210 |
if isinstance(conversion_request, ConvertDocumentFileSourcesRequest):
|
211 |
for file_source in conversion_request.file_sources:
|
212 |
sources.append(file_source.to_document_stream())
|
|
|
242 |
)
|
243 |
async def process_file(
|
244 |
background_tasks: BackgroundTasks,
|
245 |
+
files: list[UploadFile],
|
246 |
options: Annotated[
|
247 |
ConvertDocumentsOptions, FormDepends(ConvertDocumentsOptions)
|
248 |
],
|
docling_serve/datamodel/convert.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
# Define the input options for the API
|
2 |
-
from typing import Annotated,
|
3 |
|
4 |
from pydantic import BaseModel, Field
|
5 |
|
@@ -10,7 +10,7 @@ from docling_core.types.doc import ImageRefMode
|
|
10 |
|
11 |
class ConvertDocumentsOptions(BaseModel):
|
12 |
from_formats: Annotated[
|
13 |
-
|
14 |
Field(
|
15 |
description=(
|
16 |
"Input format(s) to convert from. String or list of strings. "
|
@@ -22,7 +22,7 @@ class ConvertDocumentsOptions(BaseModel):
|
|
22 |
] = list(InputFormat)
|
23 |
|
24 |
to_formats: Annotated[
|
25 |
-
|
26 |
Field(
|
27 |
description=(
|
28 |
"Output format(s) to convert to. String or list of strings. "
|
@@ -83,7 +83,7 @@ class ConvertDocumentsOptions(BaseModel):
|
|
83 |
] = OcrEngine.EASYOCR
|
84 |
|
85 |
ocr_lang: Annotated[
|
86 |
-
Optional[
|
87 |
Field(
|
88 |
description=(
|
89 |
"List of languages used by the OCR engine. "
|
|
|
1 |
# Define the input options for the API
|
2 |
+
from typing import Annotated, Optional
|
3 |
|
4 |
from pydantic import BaseModel, Field
|
5 |
|
|
|
10 |
|
11 |
class ConvertDocumentsOptions(BaseModel):
|
12 |
from_formats: Annotated[
|
13 |
+
list[InputFormat],
|
14 |
Field(
|
15 |
description=(
|
16 |
"Input format(s) to convert from. String or list of strings. "
|
|
|
22 |
] = list(InputFormat)
|
23 |
|
24 |
to_formats: Annotated[
|
25 |
+
list[OutputFormat],
|
26 |
Field(
|
27 |
description=(
|
28 |
"Output format(s) to convert to. String or list of strings. "
|
|
|
83 |
] = OcrEngine.EASYOCR
|
84 |
|
85 |
ocr_lang: Annotated[
|
86 |
+
Optional[list[str]],
|
87 |
Field(
|
88 |
description=(
|
89 |
"List of languages used by the OCR engine. "
|
docling_serve/datamodel/requests.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import base64
|
2 |
from io import BytesIO
|
3 |
-
from typing import Annotated, Any,
|
4 |
|
5 |
from pydantic import BaseModel, Field
|
6 |
|
@@ -22,7 +22,7 @@ class HttpSource(BaseModel):
|
|
22 |
),
|
23 |
]
|
24 |
headers: Annotated[
|
25 |
-
|
26 |
Field(
|
27 |
description="Additional headers used to fetch the urls, "
|
28 |
"e.g. authorization, agent, etc"
|
@@ -50,11 +50,11 @@ class FileSource(BaseModel):
|
|
50 |
|
51 |
|
52 |
class ConvertDocumentHttpSourcesRequest(DocumentsConvertBase):
|
53 |
-
http_sources:
|
54 |
|
55 |
|
56 |
class ConvertDocumentFileSourcesRequest(DocumentsConvertBase):
|
57 |
-
file_sources:
|
58 |
|
59 |
|
60 |
ConvertDocumentsRequest = Union[
|
|
|
1 |
import base64
|
2 |
from io import BytesIO
|
3 |
+
from typing import Annotated, Any, Union
|
4 |
|
5 |
from pydantic import BaseModel, Field
|
6 |
|
|
|
22 |
),
|
23 |
]
|
24 |
headers: Annotated[
|
25 |
+
dict[str, Any],
|
26 |
Field(
|
27 |
description="Additional headers used to fetch the urls, "
|
28 |
"e.g. authorization, agent, etc"
|
|
|
50 |
|
51 |
|
52 |
class ConvertDocumentHttpSourcesRequest(DocumentsConvertBase):
|
53 |
+
http_sources: list[HttpSource]
|
54 |
|
55 |
|
56 |
class ConvertDocumentFileSourcesRequest(DocumentsConvertBase):
|
57 |
+
file_sources: list[FileSource]
|
58 |
|
59 |
|
60 |
ConvertDocumentsRequest = Union[
|
docling_serve/datamodel/responses.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import enum
|
2 |
-
from typing import
|
3 |
|
4 |
from pydantic import BaseModel
|
5 |
|
@@ -25,9 +25,9 @@ class DocumentResponse(BaseModel):
|
|
25 |
class ConvertDocumentResponse(BaseModel):
|
26 |
document: DocumentResponse
|
27 |
status: ConversionStatus
|
28 |
-
errors:
|
29 |
processing_time: float
|
30 |
-
timings:
|
31 |
|
32 |
|
33 |
class ConvertDocumentErrorResponse(BaseModel):
|
|
|
1 |
import enum
|
2 |
+
from typing import Optional
|
3 |
|
4 |
from pydantic import BaseModel
|
5 |
|
|
|
25 |
class ConvertDocumentResponse(BaseModel):
|
26 |
document: DocumentResponse
|
27 |
status: ConversionStatus
|
28 |
+
errors: list[ErrorItem] = []
|
29 |
processing_time: float
|
30 |
+
timings: dict[str, ProfilingItem] = {}
|
31 |
|
32 |
|
33 |
class ConvertDocumentErrorResponse(BaseModel):
|
docling_serve/docling_conversion.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1 |
import hashlib
|
2 |
import json
|
3 |
import logging
|
|
|
4 |
from pathlib import Path
|
5 |
-
from typing import Any,
|
6 |
|
7 |
from fastapi import HTTPException
|
8 |
|
@@ -33,7 +34,7 @@ _log = logging.getLogger(__name__)
|
|
33 |
|
34 |
|
35 |
# Document converters will be preloaded and stored in a dictionary
|
36 |
-
converters:
|
37 |
|
38 |
|
39 |
# Custom serializer for PdfFormatOption
|
@@ -69,7 +70,7 @@ def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str:
|
|
69 |
# Computes the PDF pipeline options and returns the PdfFormatOption and its hash
|
70 |
def get_pdf_pipeline_opts( # noqa: C901
|
71 |
request: ConvertDocumentsOptions,
|
72 |
-
) ->
|
73 |
if request.ocr_engine == OcrEngine.EASYOCR:
|
74 |
try:
|
75 |
import easyocr # noqa: F401
|
@@ -129,7 +130,7 @@ def get_pdf_pipeline_opts( # noqa: C901
|
|
129 |
pipeline_options.images_scale = request.images_scale
|
130 |
|
131 |
if request.pdf_backend == PdfBackend.DLPARSE_V1:
|
132 |
-
backend:
|
133 |
elif request.pdf_backend == PdfBackend.DLPARSE_V2:
|
134 |
backend = DoclingParseV2DocumentBackend
|
135 |
elif request.pdf_backend == PdfBackend.PYPDFIUM2:
|
@@ -177,12 +178,12 @@ def get_pdf_pipeline_opts( # noqa: C901
|
|
177 |
def convert_documents(
|
178 |
sources: Iterable[Union[Path, str, DocumentStream]],
|
179 |
options: ConvertDocumentsOptions,
|
180 |
-
headers: Optional[
|
181 |
):
|
182 |
pdf_format_option, options_hash = get_pdf_pipeline_opts(options)
|
183 |
|
184 |
if options_hash not in converters:
|
185 |
-
format_options:
|
186 |
InputFormat.PDF: pdf_format_option,
|
187 |
InputFormat.IMAGE: pdf_format_option,
|
188 |
}
|
|
|
1 |
import hashlib
|
2 |
import json
|
3 |
import logging
|
4 |
+
from collections.abc import Iterable, Iterator
|
5 |
from pathlib import Path
|
6 |
+
from typing import Any, Optional, Union
|
7 |
|
8 |
from fastapi import HTTPException
|
9 |
|
|
|
34 |
|
35 |
|
36 |
# Document converters will be preloaded and stored in a dictionary
|
37 |
+
converters: dict[bytes, DocumentConverter] = {}
|
38 |
|
39 |
|
40 |
# Custom serializer for PdfFormatOption
|
|
|
70 |
# Computes the PDF pipeline options and returns the PdfFormatOption and its hash
|
71 |
def get_pdf_pipeline_opts( # noqa: C901
|
72 |
request: ConvertDocumentsOptions,
|
73 |
+
) -> tuple[PdfFormatOption, bytes]:
|
74 |
if request.ocr_engine == OcrEngine.EASYOCR:
|
75 |
try:
|
76 |
import easyocr # noqa: F401
|
|
|
130 |
pipeline_options.images_scale = request.images_scale
|
131 |
|
132 |
if request.pdf_backend == PdfBackend.DLPARSE_V1:
|
133 |
+
backend: type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
134 |
elif request.pdf_backend == PdfBackend.DLPARSE_V2:
|
135 |
backend = DoclingParseV2DocumentBackend
|
136 |
elif request.pdf_backend == PdfBackend.PYPDFIUM2:
|
|
|
178 |
def convert_documents(
|
179 |
sources: Iterable[Union[Path, str, DocumentStream]],
|
180 |
options: ConvertDocumentsOptions,
|
181 |
+
headers: Optional[dict[str, Any]] = None,
|
182 |
):
|
183 |
pdf_format_option, options_hash = get_pdf_pipeline_opts(options)
|
184 |
|
185 |
if options_hash not in converters:
|
186 |
+
format_options: dict[InputFormat, FormatOption] = {
|
187 |
InputFormat.PDF: pdf_format_option,
|
188 |
InputFormat.IMAGE: pdf_format_option,
|
189 |
}
|
docling_serve/engines/async_local/orchestrator.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import asyncio
|
2 |
import logging
|
3 |
import uuid
|
4 |
-
from typing import
|
5 |
|
6 |
from fastapi import WebSocket
|
7 |
|
@@ -30,9 +30,9 @@ class TaskNotFoundError(OrchestratorError):
|
|
30 |
class AsyncLocalOrchestrator(BaseOrchestrator):
|
31 |
def __init__(self):
|
32 |
self.task_queue = asyncio.Queue()
|
33 |
-
self.tasks:
|
34 |
-
self.queue_list:
|
35 |
-
self.task_subscribers:
|
36 |
|
37 |
async def enqueue(self, request: ConvertDocumentsRequest) -> Task:
|
38 |
task_id = str(uuid.uuid4())
|
|
|
1 |
import asyncio
|
2 |
import logging
|
3 |
import uuid
|
4 |
+
from typing import Optional
|
5 |
|
6 |
from fastapi import WebSocket
|
7 |
|
|
|
30 |
class AsyncLocalOrchestrator(BaseOrchestrator):
|
31 |
def __init__(self):
|
32 |
self.task_queue = asyncio.Queue()
|
33 |
+
self.tasks: dict[str, Task] = {}
|
34 |
+
self.queue_list: list[str] = []
|
35 |
+
self.task_subscribers: dict[str, set[WebSocket]] = {}
|
36 |
|
37 |
async def enqueue(self, request: ConvertDocumentsRequest) -> Task:
|
38 |
task_id = str(uuid.uuid4())
|
docling_serve/engines/async_local/worker.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import asyncio
|
2 |
import logging
|
3 |
import time
|
4 |
-
from typing import TYPE_CHECKING, Any,
|
5 |
|
6 |
from fastapi import BackgroundTasks
|
7 |
|
@@ -50,8 +50,8 @@ class AsyncLocalWorker:
|
|
50 |
# Define a callback function to send progress updates to the client.
|
51 |
# TODO: send partial updates, e.g. when a document in the batch is done
|
52 |
def run_conversion():
|
53 |
-
sources:
|
54 |
-
headers: Optional[
|
55 |
if isinstance(task.request, ConvertDocumentFileSourcesRequest):
|
56 |
for file_source in task.request.file_sources:
|
57 |
sources.append(file_source.to_document_stream())
|
|
|
1 |
import asyncio
|
2 |
import logging
|
3 |
import time
|
4 |
+
from typing import TYPE_CHECKING, Any, Optional, Union
|
5 |
|
6 |
from fastapi import BackgroundTasks
|
7 |
|
|
|
50 |
# Define a callback function to send progress updates to the client.
|
51 |
# TODO: send partial updates, e.g. when a document in the batch is done
|
52 |
def run_conversion():
|
53 |
+
sources: list[Union[str, DocumentStream]] = []
|
54 |
+
headers: Optional[dict[str, Any]] = None
|
55 |
if isinstance(task.request, ConvertDocumentFileSourcesRequest):
|
56 |
for file_source in task.request.file_sources:
|
57 |
sources.append(file_source.to_document_stream())
|
docling_serve/helper_functions.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import inspect
|
2 |
import re
|
3 |
-
from typing import
|
4 |
|
5 |
from fastapi import Depends, Form
|
6 |
from pydantic import BaseModel
|
@@ -8,7 +8,7 @@ from pydantic import BaseModel
|
|
8 |
|
9 |
# Adapted from
|
10 |
# https://github.com/fastapi/fastapi/discussions/8971#discussioncomment-7892972
|
11 |
-
def FormDepends(cls: Type[BaseModel]):
|
12 |
new_parameters = []
|
13 |
|
14 |
for field_name, model_field in cls.model_fields.items():
|
@@ -34,8 +34,8 @@ def FormDepends(cls: Type[BaseModel]):
|
|
34 |
return Depends(as_form_func)
|
35 |
|
36 |
|
37 |
-
def _to_list_of_strings(input_value: Union[str, List[str]]) -> List[str]:
|
38 |
-
def split_and_strip(value: str) -> List[str]:
|
39 |
if re.search(r"[;,]", value):
|
40 |
return [item.strip() for item in re.split(r"[;,]", value)]
|
41 |
else:
|
|
|
1 |
import inspect
|
2 |
import re
|
3 |
+
from typing import Union
|
4 |
|
5 |
from fastapi import Depends, Form
|
6 |
from pydantic import BaseModel
|
|
|
8 |
|
9 |
# Adapted from
|
10 |
# https://github.com/fastapi/fastapi/discussions/8971#discussioncomment-7892972
|
11 |
+
def FormDepends(cls: type[BaseModel]):
|
12 |
new_parameters = []
|
13 |
|
14 |
for field_name, model_field in cls.model_fields.items():
|
|
|
34 |
return Depends(as_form_func)
|
35 |
|
36 |
|
37 |
+
def _to_list_of_strings(input_value: Union[str, list[str]]) -> list[str]:
|
38 |
+
def split_and_strip(value: str) -> list[str]:
|
39 |
if re.search(r"[;,]", value):
|
40 |
return [item.strip() for item in re.split(r"[;,]", value)]
|
41 |
else:
|
docling_serve/response_preparation.py
CHANGED
@@ -3,8 +3,9 @@ import os
|
|
3 |
import shutil
|
4 |
import tempfile
|
5 |
import time
|
|
|
6 |
from pathlib import Path
|
7 |
-
from typing import
|
8 |
|
9 |
from fastapi import BackgroundTasks, HTTPException
|
10 |
from fastapi.responses import FileResponse
|
|
|
3 |
import shutil
|
4 |
import tempfile
|
5 |
import time
|
6 |
+
from collections.abc import Iterable
|
7 |
from pathlib import Path
|
8 |
+
from typing import Union
|
9 |
|
10 |
from fastapi import BackgroundTasks, HTTPException
|
11 |
from fastapi.responses import FileResponse
|
pyproject.toml
CHANGED
@@ -145,7 +145,8 @@ select = [
|
|
145 |
"S307", # eval
|
146 |
# "T20", # (disallow print statements) keep debugging statements out of the codebase
|
147 |
"W", # pycodestyle warnings
|
148 |
-
"ASYNC" # async
|
|
|
149 |
]
|
150 |
|
151 |
ignore = [
|
@@ -154,6 +155,7 @@ ignore = [
|
|
154 |
"F811", # "redefinition of the same function"
|
155 |
"PL", # Pylint
|
156 |
"RUF012", # Mutable Class Attributes
|
|
|
157 |
]
|
158 |
|
159 |
#extend-select = []
|
|
|
145 |
"S307", # eval
|
146 |
# "T20", # (disallow print statements) keep debugging statements out of the codebase
|
147 |
"W", # pycodestyle warnings
|
148 |
+
"ASYNC", # async
|
149 |
+
"UP", # pyupgrade
|
150 |
]
|
151 |
|
152 |
ignore = [
|
|
|
155 |
"F811", # "redefinition of the same function"
|
156 |
"PL", # Pylint
|
157 |
"RUF012", # Mutable Class Attributes
|
158 |
+
"UP007", # Optional and Union
|
159 |
]
|
160 |
|
161 |
#extend-select = []
|