Spaces:
Configuration error
Configuration error
import hashlib | |
import json | |
import logging | |
from collections.abc import Iterable, Iterator | |
from pathlib import Path | |
from typing import Any, Optional, Union | |
from fastapi import HTTPException | |
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend | |
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend | |
from docling.backend.pdf_backend import PdfDocumentBackend | |
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend | |
from docling.datamodel.base_models import DocumentStream, InputFormat | |
from docling.datamodel.document import ConversionResult | |
from docling.datamodel.pipeline_options import ( | |
EasyOcrOptions, | |
OcrEngine, | |
OcrOptions, | |
PdfBackend, | |
PdfPipelineOptions, | |
RapidOcrOptions, | |
TableFormerMode, | |
TesseractOcrOptions, | |
) | |
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption | |
from docling_core.types.doc import ImageRefMode | |
from docling_serve.datamodel.convert import ConvertDocumentsOptions | |
from docling_serve.helper_functions import _to_list_of_strings | |
from docling_serve.settings import docling_serve_settings | |
_log = logging.getLogger(__name__) | |
# Document converters will be preloaded and stored in a dictionary | |
converters: dict[bytes, DocumentConverter] = {} | |
# Custom serializer for PdfFormatOption | |
# (model_dump_json does not work with some classes) | |
def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str: | |
data = pdf_format_option.model_dump() | |
# pipeline_options are not fully serialized by model_dump, dedicated pass | |
if pdf_format_option.pipeline_options: | |
data["pipeline_options"] = pdf_format_option.pipeline_options.model_dump() | |
# Replace `artifacts_path` with a string representation | |
data["pipeline_options"]["artifacts_path"] = repr( | |
data["pipeline_options"]["artifacts_path"] | |
) | |
# Replace `pipeline_cls` with a string representation | |
data["pipeline_cls"] = repr(data["pipeline_cls"]) | |
# Replace `backend` with a string representation | |
data["backend"] = repr(data["backend"]) | |
# Handle `device` in `accelerator_options` | |
if "accelerator_options" in data and "device" in data["accelerator_options"]: | |
data["accelerator_options"]["device"] = repr( | |
data["accelerator_options"]["device"] | |
) | |
# Serialize the dictionary to JSON with sorted keys to have consistent hashes | |
return json.dumps(data, sort_keys=True) | |
# Computes the PDF pipeline options and returns the PdfFormatOption and its hash | |
def get_pdf_pipeline_opts( # noqa: C901 | |
request: ConvertDocumentsOptions, | |
) -> tuple[PdfFormatOption, bytes]: | |
if request.ocr_engine == OcrEngine.EASYOCR: | |
try: | |
import easyocr # noqa: F401 | |
except ImportError: | |
raise HTTPException( | |
status_code=400, | |
detail="The requested OCR engine" | |
f" (ocr_engine={request.ocr_engine.value})" | |
" is not available on this system. Please choose another OCR engine " | |
"or contact your system administrator.", | |
) | |
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=request.force_ocr) | |
elif request.ocr_engine == OcrEngine.TESSERACT: | |
try: | |
import tesserocr # noqa: F401 | |
except ImportError: | |
raise HTTPException( | |
status_code=400, | |
detail="The requested OCR engine" | |
f" (ocr_engine={request.ocr_engine.value})" | |
" is not available on this system. Please choose another OCR engine " | |
"or contact your system administrator.", | |
) | |
ocr_options = TesseractOcrOptions(force_full_page_ocr=request.force_ocr) | |
elif request.ocr_engine == OcrEngine.RAPIDOCR: | |
try: | |
from rapidocr_onnxruntime import RapidOCR # noqa: F401 | |
except ImportError: | |
raise HTTPException( | |
status_code=400, | |
detail="The requested OCR engine" | |
f" (ocr_engine={request.ocr_engine.value})" | |
" is not available on this system. Please choose another OCR engine " | |
"or contact your system administrator.", | |
) | |
ocr_options = RapidOcrOptions(force_full_page_ocr=request.force_ocr) | |
else: | |
raise RuntimeError(f"Unexpected OCR engine type {request.ocr_engine}") | |
if request.ocr_lang is not None: | |
if isinstance(request.ocr_lang, str): | |
ocr_options.lang = _to_list_of_strings(request.ocr_lang) | |
else: | |
ocr_options.lang = request.ocr_lang | |
pipeline_options = PdfPipelineOptions( | |
do_ocr=request.do_ocr, | |
ocr_options=ocr_options, | |
do_table_structure=request.do_table_structure, | |
do_code_enrichment=request.do_code_enrichment, | |
do_formula_enrichment=request.do_formula_enrichment, | |
do_picture_classification=request.do_picture_classification, | |
do_picture_description=request.do_picture_description, | |
) | |
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching | |
pipeline_options.table_structure_options.mode = TableFormerMode(request.table_mode) | |
if request.image_export_mode != ImageRefMode.PLACEHOLDER: | |
pipeline_options.generate_page_images = True | |
if request.images_scale: | |
pipeline_options.images_scale = request.images_scale | |
if request.pdf_backend == PdfBackend.DLPARSE_V1: | |
backend: type[PdfDocumentBackend] = DoclingParseDocumentBackend | |
elif request.pdf_backend == PdfBackend.DLPARSE_V2: | |
backend = DoclingParseV2DocumentBackend | |
elif request.pdf_backend == PdfBackend.PYPDFIUM2: | |
backend = PyPdfiumDocumentBackend | |
else: | |
raise RuntimeError(f"Unexpected PDF backend type {request.pdf_backend}") | |
if docling_serve_settings.artifacts_path is not None: | |
if str(docling_serve_settings.artifacts_path.absolute()) == "": | |
_log.info( | |
"artifacts_path is an empty path, model weights will be dowloaded " | |
"at runtime." | |
) | |
pipeline_options.artifacts_path = None | |
elif docling_serve_settings.artifacts_path.is_dir(): | |
_log.info( | |
"artifacts_path is set to a valid directory. " | |
"No model weights will be downloaded at runtime." | |
) | |
pipeline_options.artifacts_path = docling_serve_settings.artifacts_path | |
else: | |
_log.warning( | |
"artifacts_path is set to an invalid directory. " | |
"The system will download the model weights at runtime." | |
) | |
pipeline_options.artifacts_path = None | |
else: | |
_log.info( | |
"artifacts_path is unset. " | |
"The system will download the model weights at runtime." | |
) | |
pdf_format_option = PdfFormatOption( | |
pipeline_options=pipeline_options, | |
backend=backend, | |
) | |
serialized_data = _serialize_pdf_format_option(pdf_format_option) | |
options_hash = hashlib.sha1(serialized_data.encode()).digest() | |
return pdf_format_option, options_hash | |
def convert_documents( | |
sources: Iterable[Union[Path, str, DocumentStream]], | |
options: ConvertDocumentsOptions, | |
headers: Optional[dict[str, Any]] = None, | |
): | |
pdf_format_option, options_hash = get_pdf_pipeline_opts(options) | |
if options_hash not in converters: | |
format_options: dict[InputFormat, FormatOption] = { | |
InputFormat.PDF: pdf_format_option, | |
InputFormat.IMAGE: pdf_format_option, | |
} | |
converters[options_hash] = DocumentConverter(format_options=format_options) | |
_log.info(f"We now have {len(converters)} converters in memory.") | |
results: Iterator[ConversionResult] = converters[options_hash].convert_all( | |
sources, | |
headers=headers, | |
) | |
return results | |