Spaces:

CerealDev
/

Docling-UI

Configuration error

Michele Dolfi committed on Dec 19, 2024

Commit

84568d7

unverified ·

1 Parent(s): 3d47a36

feat: upgrade endpoint to docling v2 (#13)

* upgrade endpoint to docling v2

Signed-off-by: Michele Dolfi <[email protected]>

* fix Containerfile

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Michele Dolfi <[email protected]>

Files changed (5) hide show

Containerfile +2 -2
docling_serve/app.py +214 -31
docling_serve/settings.py +0 -2
poetry.lock +0 -0
pyproject.toml +35 -6

Containerfile CHANGED Viewed

@@ -20,7 +20,7 @@ RUN if [ "$CPU_ONLY" = "true" ]; then \
 ENV HF_HOME=/tmp/
 ENV TORCH_HOME=/tmp/
-RUN poetry run python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
 # On container environments, always set a thread budget to avoid undesired thread congestion.
 ENV OMP_NUM_THREADS=4
@@ -29,4 +29,4 @@ COPY ./docling_serve /docling-serve/docling_serve
 EXPOSE 5000
-CMD ["poetry", "run", "uvicorn", "--port", "5000", "docling_serve.app:app"]

 ENV HF_HOME=/tmp/
 ENV TORCH_HOME=/tmp/
+RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
 # On container environments, always set a thread budget to avoid undesired thread congestion.
 ENV OMP_NUM_THREADS=4
 EXPOSE 5000
+CMD ["poetry", "run", "uvicorn", "--port", "5000", "--host", "0.0.0.0", "docling_serve.app:app"]

docling_serve/app.py CHANGED Viewed

@@ -1,21 +1,55 @@
 import base64
 from contextlib import asynccontextmanager
 from io import BytesIO
-from pathlib import Path
-from typing import Any, Dict, Union
 import httpx
 from docling.datamodel.base_models import (
     ConversionStatus,
     DocumentStream,
-    PipelineOptions,
 )
-from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
-from docling_serve.settings import Settings
 class HttpSource(BaseModel):
@@ -28,16 +62,30 @@ class FileSource(BaseModel):
     filename: str
-class ConvertDocumentHttpSourceRequest(BaseModel):
     http_source: HttpSource
-class ConvertDocumentFileSourceRequest(BaseModel):
     file_source: FileSource
 class ConvertDocumentResponse(BaseModel):
-    content_md: str
 ConvertDocumentRequest = Union[
@@ -45,20 +93,93 @@ ConvertDocumentRequest = Union[
 ]
-models = {}
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    # Converter
-    settings = Settings()
-    pipeline_options = PipelineOptions()
-    pipeline_options.do_ocr = settings.do_ocr
-    pipeline_options.do_table_structure = settings.do_table_structure
-    models["converter"] = DocumentConverter(pipeline_options=pipeline_options)
     yield
-    models.clear()
 app = FastAPI(
@@ -67,10 +188,14 @@ app = FastAPI(
 )
-@app.post("/convert")
-def convert_pdf_document(
     body: ConvertDocumentRequest,
-) -> ConvertDocumentResponse:
     filename: str
     buf: BytesIO
@@ -81,16 +206,74 @@ def convert_pdf_document(
     elif isinstance(body, ConvertDocumentHttpSourceRequest):
         http_res = httpx.get(body.http_source.url, headers=body.http_source.headers)
         buf = BytesIO(http_res.content)
-        filename = Path(
-            body.http_source.url
-        ).name  # TODO: use better way to detect filename, e.g. from Content-Disposition
-    docs_input = DocumentConversionInput.from_streams(
-        [DocumentStream(filename=filename, stream=buf)]
     )
-    result: ConversionResult = next(models["converter"].convert(docs_input), None)
-    if result is None or result.status != ConversionStatus.SUCCESS:
-        raise HTTPException(status_code=500, detail={"errors": result.errors})
-    return ConvertDocumentResponse(content_md=result.render_as_markdown())

 import base64
+import hashlib
 from contextlib import asynccontextmanager
+from enum import Enum
 from io import BytesIO
+from typing import Any, Dict, List, Optional, Tuple, Union
 import httpx
 from docling.datamodel.base_models import (
     ConversionStatus,
     DocumentStream,
+    ErrorItem,
+    InputFormat,
 )
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    OcrOptions,
+    PdfPipelineOptions,
+    RapidOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.utils.profiling import ProfilingItem
+from docling_core.types.doc import DoclingDocument, ImageRefMode
+from docling_core.utils.file import resolve_remote_filename
+from fastapi import FastAPI, HTTPException, Response
+from pydantic import AnyHttpUrl, BaseModel
+# TODO: import enum from Docling, once it is exposed
+# OCR backends a request may select through `ConvertOptions.ocr_engine`;
+# `get_pdf_pipeline_opts` has one branch per member.
+class OcrEngine(str, Enum):
+    EASYOCR = "easyocr"
+    TESSERACT = "tesseract"
+    RAPIDOCR = "rapidocr"
+# Per-request conversion settings. Every field has a default, so an empty
+# options object is valid.
+class ConvertOptions(BaseModel):
+    # Which representations `/convert` places in the response document.
+    output_docling_document: bool = True
+    output_markdown: bool = False
+    output_html: bool = False
+    # OCR controls: `force_ocr` is forwarded as `force_full_page_ocr`, and
+    # `ocr_lang` (when not None) overrides the engine options' `lang`.
+    do_ocr: bool = True
+    ocr_engine: OcrEngine = OcrEngine.EASYOCR
+    ocr_lang: Optional[List[str]] = None
+    force_ocr: bool = False
+    do_table_structure: bool = True
+    # Drives page/picture image generation in the pipeline and selects
+    # embedded vs. placeholder image references in Markdown/HTML exports.
+    include_images: bool = True
+    images_scale: float = 2.0
+# Shared base of the request bodies: carries the conversion options, which
+# default to `ConvertOptions()`.
+class DocumentConvertBase(BaseModel):
+    options: ConvertOptions = ConvertOptions()
 class HttpSource(BaseModel):
     filename: str
+# Request body for a document that the server downloads from
+# `http_source.url` (sending `http_source.headers`) before converting it.
+class ConvertDocumentHttpSourceRequest(DocumentConvertBase):
     http_source: HttpSource
+# Request body that carries the document itself as a `FileSource`.
+# NOTE(review): the branch that decodes it is elided from this diff view.
+class ConvertDocumentFileSourceRequest(DocumentConvertBase):
     file_source: FileSource
+# Converted document in the requested output formats. `convert_document`
+# fills a field only when the matching `output_*` option is set; otherwise
+# the field stays None.
+class DocumentResponse(BaseModel):
+    markdown: Optional[str] = None
+    docling_document: Optional[DoclingDocument] = None
+    html: Optional[str] = None
+# Response body of POST /convert: the exported document plus the conversion
+# status and timings.
+# NOTE(review): `convert_document` never passes `errors`, so on that path the
+# field is always the default empty list.
 class ConvertDocumentResponse(BaseModel):
+    document: DocumentResponse
+    status: ConversionStatus
+    errors: List[ErrorItem] = []
+    timings: Dict[str, ProfilingItem] = {}
+# Error payload that carries only the conversion status.
+# NOTE(review): no handler visible in this diff references this model —
+# confirm whether it is still needed.
+class ConvertDocumentErrorResponse(BaseModel):
+    status: ConversionStatus
+    # errors: List[ErrorItem] = []
 ConvertDocumentRequest = Union[
 ]
+# Response subclass that labels the body as `text/markdown`; it is the
+# response class of the `/convert/markdown` route.
+class MarkdownTextResponse(Response):
+    media_type = "text/markdown"
+# Payload returned by GET /health; `status` defaults to "ok".
+class HealthCheckResponse(BaseModel):
+    status: str = "ok"
+def _ocr_engine_unavailable(engine: OcrEngine) -> HTTPException:
+    """Build the 400 error raised when an OCR engine's package is missing."""
+    return HTTPException(
+        status_code=400,
+        detail="The requested OCR engine"
+        f" (ocr_engine={engine.value})"
+        " is not available on this system. Please choose another OCR engine "
+        "or contact your system administrator.",
+    )
+def get_pdf_pipeline_opts(options: ConvertOptions) -> Tuple[PdfPipelineOptions, str]:
+    """Translate request options into Docling PDF pipeline options.
+
+    Returns:
+        The pipeline options and the SHA-1 hex digest of their JSON dump;
+        callers use the digest as the key into ``converters``.
+
+    Raises:
+        HTTPException: 400 if the package backing the requested OCR engine
+            cannot be imported.
+        RuntimeError: if ``options.ocr_engine`` is not a known ``OcrEngine``.
+    """
+    engine = options.ocr_engine
+    ocr_options: OcrOptions
+    # The availability probe runs for the selected engine even when
+    # ``options.do_ocr`` is False, exactly as before.
+    if engine == OcrEngine.EASYOCR:
+        try:
+            import easyocr  # noqa: F401
+        except ImportError as err:
+            raise _ocr_engine_unavailable(engine) from err
+        ocr_options = EasyOcrOptions(force_full_page_ocr=options.force_ocr)
+    elif engine == OcrEngine.TESSERACT:
+        try:
+            import tesserocr  # noqa: F401
+        except ImportError as err:
+            raise _ocr_engine_unavailable(engine) from err
+        ocr_options = TesseractOcrOptions(force_full_page_ocr=options.force_ocr)
+    elif engine == OcrEngine.RAPIDOCR:
+        try:
+            from rapidocr_onnxruntime import RapidOCR  # noqa: F401
+        except ImportError as err:
+            raise _ocr_engine_unavailable(engine) from err
+        ocr_options = RapidOcrOptions(force_full_page_ocr=options.force_ocr)
+    else:
+        raise RuntimeError(f"Unexpected OCR engine type {options.ocr_engine}")
+    # Only override the engine's own language default when the client sent one.
+    if options.ocr_lang is not None:
+        ocr_options.lang = options.ocr_lang
+    pipeline_options = PdfPipelineOptions(
+        do_ocr=options.do_ocr,
+        ocr_options=ocr_options,
+        do_table_structure=options.do_table_structure,
+        generate_page_images=options.include_images,
+        generate_picture_images=options.include_images,
+        images_scale=options.images_scale,
+    )
+    # The digest is only a cache key, so mark it as non-security use; this
+    # keeps the call working on FIPS-restricted Python builds.
+    options_hash = hashlib.sha1(
+        pipeline_options.model_dump_json().encode(), usedforsecurity=False
+    ).hexdigest()
+    return pipeline_options, options_hash
+# Cache of converters, keyed by the options hash that
+# `get_pdf_pipeline_opts` returns.
+converters: Dict[str, DocumentConverter] = {}
 @asynccontextmanager
 async def lifespan(app: FastAPI):
+    # Startup: build the converter for the default `ConvertOptions()` and
+    # initialize its PDF pipeline (presumably so model loading happens before
+    # the first request — confirm). Shutdown: drop every cached converter.
+    # settings = Settings()
+    # Converter with default options
+    pipeline_options, options_hash = get_pdf_pipeline_opts(ConvertOptions())
+    converters[options_hash] = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
+            InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
+        }
+    )
+    converters[options_hash].initialize_pipeline(InputFormat.PDF)
     yield
+    converters.clear()
 app = FastAPI(
 )
+@app.get("/health")
+def health() -> HealthCheckResponse:
+    """Liveness probe: return the default health payload."""
+    payload = HealthCheckResponse()
+    return payload
+# Load the request's document into memory, convert it with a converter
+# matching the request options (cached in `converters`), and return the
+# result; non-successful conversions are turned into HTTPException.
+# NOTE(review): the branch preceding the `elif` below is elided from this
+# diff view.
+def _convert_document(
     body: ConvertDocumentRequest,
+) -> ConversionResult:
     filename: str
     buf: BytesIO
     elif isinstance(body, ConvertDocumentHttpSourceRequest):
+        # NOTE(review): the HTTP status is not checked before the body is
+        # used, so an error response would be handed to the converter —
+        # confirm this is intended.
         http_res = httpx.get(body.http_source.url, headers=body.http_source.headers)
         buf = BytesIO(http_res.content)
+        # Derive the filename from the URL and the response headers.
+        filename = resolve_remote_filename(
+            http_url=AnyHttpUrl(body.http_source.url),
+            response_headers=dict(**http_res.headers),
+        )
+    doc_input = DocumentStream(name=filename, stream=buf)
+    pipeline_options, options_hash = get_pdf_pipeline_opts(body.options)
+    # Create a converter on first use of an options combination.
+    # NOTE(review): entries are only removed by `converters.clear()` in
+    # `lifespan`; nothing here bounds the number of distinct option hashes.
+    if options_hash not in converters:
+        converters[options_hash] = DocumentConverter(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
+                InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
+            }
+        )
+    result: ConversionResult = converters[options_hash].convert(doc_input)
+    # NOTE(review): if `result` were None, both checks below would evaluate
+    # `result.errors` / `result.status` and raise AttributeError instead of
+    # the intended HTTPException.
+    # A skipped conversion is reported as a client error (400).
+    if result is None or result.status == ConversionStatus.SKIPPED:
+        raise HTTPException(status_code=400, detail=result.errors)
+    # Any other status except SUCCESS is reported as a server error (500).
+    if result is None or result.status not in {
+        ConversionStatus.SUCCESS,
+    }:
+        raise HTTPException(
+            status_code=500, detail={"errors": result.errors, "status": result.status}
+        )
+    return result
+# POST /convert: convert the document and return the representations
+# selected by the request's `output_*` options, with status and timings.
+@app.post(
+    "/convert",
+)
+def convert_document(
+    body: ConvertDocumentRequest,
+) -> ConvertDocumentResponse:
+    result = _convert_document(body=body)
+    # Embed images in the Markdown/HTML exports only when the client asked
+    # for images; otherwise emit placeholders.
+    image_mode = (
+        ImageRefMode.EMBEDDED
+        if body.options.include_images
+        else ImageRefMode.PLACEHOLDER
+    )
+    doc_resp = DocumentResponse()
+    if body.options.output_docling_document:
+        doc_resp.docling_document = result.document
+    if body.options.output_markdown:
+        doc_resp.markdown = result.document.export_to_markdown(image_mode=image_mode)
+    if body.options.output_html:
+        doc_resp.html = result.document.export_to_html(image_mode=image_mode)
+    # `errors` is not forwarded here, so the response keeps its default.
+    return ConvertDocumentResponse(
+        document=doc_resp, status=result.status, timings=result.timings
     )
+@app.post("/convert/markdown", response_class=MarkdownTextResponse)
+def convert_document_md(
+    body: ConvertDocumentRequest,
+) -> MarkdownTextResponse:
+    """Convert the document and return only its Markdown as ``text/markdown``."""
+    conversion = _convert_document(body=body)
+    # Embed images only when the client asked for them; else use placeholders.
+    if body.options.include_images:
+        ref_mode = ImageRefMode.EMBEDDED
+    else:
+        ref_mode = ImageRefMode.PLACEHOLDER
+    markdown = conversion.document.export_to_markdown(image_mode=ref_mode)
+    return MarkdownTextResponse(markdown)

docling_serve/settings.py CHANGED Viewed

@@ -2,7 +2,5 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
 class Settings(BaseSettings):
-    do_ocr: bool = True
-    do_table_structure: bool = True
     model_config = SettingsConfigDict(env_prefix="DOCLING_")


2
3
4	class Settings(BaseSettings):


5
6	model_config = SettingsConfigDict(env_prefix="DOCLING_")

poetry.lock CHANGED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml CHANGED Viewed

@@ -30,12 +30,26 @@ classifiers = [
 ]
 [tool.poetry.dependencies]
-python = "^3.10"
-docling = "^1.11.0"
-fastapi = {version = "^0.110.2", extras = ["standard"]}
-uvicorn = "^0.30.6"
 pydantic-settings = "^2.4.0"
-httpx = "^0.27.2"
 [tool.poetry.group.pypi-torch]
 optional = false
@@ -63,6 +77,12 @@ torchvision = [
     {markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.12"', url="https://download.pytorch.org/whl/cpu/torchvision-0.19.1%2Bcpu-cp312-cp312-linux_x86_64.whl"},
 ]
 [tool.poetry.group.dev.dependencies]
 black = "^24.8.0"
 isort = "^5.13.2"
@@ -93,8 +113,17 @@ remove-unused-variables = true
 expand-star-imports = true
 recursive = true
 [[tool.mypy.overrides]]
 module = [
-    "docling.*",
 ]
 ignore_missing_imports = true

 ]
 [tool.poetry.dependencies]
+python = "^3.9"
+docling = "^2.10.0"
+fastapi = {version = "^0.115.6", extras = ["standard"]}
+uvicorn = "^0.32.1"
 pydantic-settings = "^2.4.0"
+httpx = "^0.28.1"
+tesserocr = { version = "^2.7.1", optional = true }
+rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
+onnxruntime = [
+  # 1.19.2 is the last version with python3.9 support,
+  # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
+  { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
+  { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
+]
+[tool.poetry.extras]
+tesserocr = ["tesserocr"]
+rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
 [tool.poetry.group.pypi-torch]
 optional = false
     {markers = 'platform_machine=="x86_64" and sys_platform=="linux" and python_version == "3.12"', url="https://download.pytorch.org/whl/cpu/torchvision-0.19.1%2Bcpu-cp312-cp312-linux_x86_64.whl"},
 ]
+[tool.poetry.group.constraints.dependencies]
+numpy = [
+    { version = "^2.1.0", markers = 'python_version >= "3.13"' },
+    { version = "^1.24.4", markers = 'python_version < "3.13"' },
+]
 [tool.poetry.group.dev.dependencies]
 black = "^24.8.0"
 isort = "^5.13.2"
 expand-star-imports = true
 recursive = true
+[tool.mypy]
+pretty = true
+# strict = true
+no_implicit_optional = true
+plugins = "pydantic.mypy"
+python_version = "3.10"
 [[tool.mypy.overrides]]
 module = [
+    "easyocr.*",
+    "tesserocr.*",
+    "rapidocr_onnxruntime.*",
 ]
 ignore_missing_imports = true