# Spaces:
# Configuration error
# Configuration error
# (The three lines above are page-scrape residue, kept as comments so the module parses.)
import base64
import hashlib
from contextlib import asynccontextmanager
from enum import Enum
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union

import httpx
from docling.datamodel.base_models import (
    ConversionStatus,
    DocumentStream,
    ErrorItem,
    InputFormat,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrOptions,
    PdfPipelineOptions,
    RapidOcrOptions,
    TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.utils.profiling import ProfilingItem
from docling_core.types.doc import DoclingDocument, ImageRefMode
from docling_core.utils.file import resolve_remote_filename
from fastapi import FastAPI, HTTPException, Response
from pydantic import AnyHttpUrl, BaseModel
# TODO: import enum from Docling, once it is exposed
class OcrEngine(str, Enum):
    # OCR backends a request can select. Members are also plain strings, so
    # they compare equal to (and serialise as) their lowercase value.
    EASYOCR = "easyocr"
    TESSERACT = "tesseract"
    RAPIDOCR = "rapidocr"
class ConvertOptions(BaseModel):
    # Per-request conversion settings. Documented with comments rather than a
    # class docstring so the generated API schema is left untouched.

    # Which representations convert_document() fills in on the response.
    output_docling_document: bool = True
    output_markdown: bool = False
    output_html: bool = False

    # OCR settings, forwarded to the PDF pipeline by get_pdf_pipeline_opts().
    do_ocr: bool = True
    ocr_engine: OcrEngine = OcrEngine.EASYOCR
    # None keeps whatever language list the selected OCR options default to.
    ocr_lang: Optional[List[str]] = None
    # Mapped to the OCR options' force_full_page_ocr flag.
    force_ocr: bool = False

    do_table_structure: bool = True

    # Drives page/picture image generation and whether exports embed images
    # or emit placeholders.
    include_images: bool = True
    images_scale: float = 2.0
class DocumentConvertBase(BaseModel):
    # Fields shared by every conversion request; callers may omit `options`
    # to get the defaults declared on ConvertOptions.
    options: ConvertOptions = ConvertOptions()
class HttpSource(BaseModel):
    # A document to be downloaded by the service.
    url: str
    # Extra request headers sent along with the download.
    headers: Dict[str, Any] = {}
class FileSource(BaseModel):
    # A document shipped inline with the request.
    # Base64-encoded file content.
    base64_string: str
    # Name given to the decoded stream when it is handed to the converter.
    filename: str
class ConvertDocumentHttpSourceRequest(DocumentConvertBase):
    # Conversion request whose document is fetched from a URL.
    http_source: HttpSource
class ConvertDocumentFileSourceRequest(DocumentConvertBase):
    # Conversion request whose document is embedded as base64 content.
    file_source: FileSource
class DocumentResponse(BaseModel):
    # Converted document in each requested representation; a field stays None
    # when the matching output_* option was not enabled.
    markdown: Optional[str] = None
    docling_document: Optional[DoclingDocument] = None
    html: Optional[str] = None
class ConvertDocumentResponse(BaseModel):
    # Payload returned by convert_document().
    document: DocumentResponse
    status: ConversionStatus
    errors: List[ErrorItem] = []
    # Profiling entries keyed by name, copied from the conversion result.
    timings: Dict[str, ProfilingItem] = {}
class ConvertDocumentErrorResponse(BaseModel):
    # Error payload shape. NOTE(review): nothing in this view references this
    # model — confirm it is still used before extending it.
    status: ConversionStatus
    # errors: List[ErrorItem] = []
# Request body accepted by the conversion functions: either an inline file or
# a URL to download.
ConvertDocumentRequest = Union[
    ConvertDocumentFileSourceRequest, ConvertDocumentHttpSourceRequest
]
class MarkdownTextResponse(Response):
    """Plain response whose Content-Type is ``text/markdown``."""

    media_type = "text/markdown"
class HealthCheckResponse(BaseModel):
    # Body returned by health(); always reports "ok" unless overridden.
    status: str = "ok"
def _ocr_engine_unavailable(engine: OcrEngine) -> HTTPException:
    """Build the HTTP 400 error raised when an OCR backend cannot be imported."""
    return HTTPException(
        status_code=400,
        detail="The requested OCR engine"
        f" (ocr_engine={engine.value})"
        " is not available on this system. Please choose another OCR engine "
        "or contact your system administrator.",
    )


def get_pdf_pipeline_opts(options: ConvertOptions) -> Tuple[PdfPipelineOptions, str]:
    """Translate request options into Docling PDF pipeline options.

    Args:
        options: Conversion settings supplied with the request.

    Returns:
        A ``(pipeline_options, options_hash)`` pair. The hash is the SHA-1 hex
        digest of the JSON-serialised pipeline options and is used as the key
        of the converter cache, not for anything security related.

    Raises:
        HTTPException: 400 when the Python package backing the requested OCR
            engine cannot be imported.
        RuntimeError: When ``options.ocr_engine`` is not a known engine.
    """
    engine = options.ocr_engine
    ocr_options: OcrOptions

    # The backend import is probed even when ``do_ocr`` is False, so a request
    # naming an engine that is not installed is always rejected.
    if engine == OcrEngine.EASYOCR:
        try:
            import easyocr  # noqa: F401
        except ImportError as err:
            raise _ocr_engine_unavailable(engine) from err
        ocr_options = EasyOcrOptions(force_full_page_ocr=options.force_ocr)
    elif engine == OcrEngine.TESSERACT:
        try:
            import tesserocr  # noqa: F401
        except ImportError as err:
            raise _ocr_engine_unavailable(engine) from err
        ocr_options = TesseractOcrOptions(force_full_page_ocr=options.force_ocr)
    elif engine == OcrEngine.RAPIDOCR:
        try:
            from rapidocr_onnxruntime import RapidOCR  # noqa: F401
        except ImportError as err:
            raise _ocr_engine_unavailable(engine) from err
        ocr_options = RapidOcrOptions(force_full_page_ocr=options.force_ocr)
    else:
        raise RuntimeError(f"Unexpected OCR engine type {options.ocr_engine}")

    # Only override the engine's own default language list when one was given.
    if options.ocr_lang is not None:
        ocr_options.lang = options.ocr_lang

    pipeline_options = PdfPipelineOptions(
        do_ocr=options.do_ocr,
        ocr_options=ocr_options,
        do_table_structure=options.do_table_structure,
        generate_page_images=options.include_images,
        generate_picture_images=options.include_images,
        images_scale=options.images_scale,
    )

    options_hash = hashlib.sha1(pipeline_options.model_dump_json().encode()).hexdigest()

    return pipeline_options, options_hash
# Cache of ready-to-use converters, keyed by the options hash returned from
# get_pdf_pipeline_opts(). Filled at startup and on demand; emptied on shutdown.
converters: Dict[str, DocumentConverter] = {}
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: warm up the default converter, clear the cache on exit.

    Decorated with ``asynccontextmanager`` because FastAPI's ``lifespan``
    argument expects an async context manager; a bare async generator is only
    accepted through a deprecated fallback.

    Args:
        app: The FastAPI application being started (unused here).
    """
    # settings = Settings()

    # Converter with default options
    pipeline_options, options_hash = get_pdf_pipeline_opts(ConvertOptions())
    default_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
        }
    )
    converters[options_hash] = default_converter

    # Build the PDF pipeline before serving instead of on the first request.
    default_converter.initialize_pipeline(InputFormat.PDF)

    try:
        yield
    finally:
        # Drop cached converters even if the application exits with an error.
        converters.clear()
# The ASGI application; `lifespan` runs the converter warm-up and cleanup.
app = FastAPI(
    title="Docling Serve",
    lifespan=lifespan,
)
# NOTE(review): no route decorator is visible for this function in this view —
# confirm it is registered on `app` (path and method) in the real module.
def health() -> HealthCheckResponse:
    """Return the default health payload (``status == "ok"``)."""
    return HealthCheckResponse()
def _convert_document(
    body: ConvertDocumentRequest,
) -> ConversionResult:
    """Load the requested document and convert it with a cached converter.

    Args:
        body: Either a file-source request (base64 content) or an HTTP-source
            request (URL to download).

    Returns:
        The conversion result; only results with status ``SUCCESS`` are returned.

    Raises:
        HTTPException: 400 when the base64 payload is invalid, the download
            fails or answers with a non-success status, the requested OCR
            engine is unavailable, or the conversion was skipped; 500 when the
            converter yields no result or any other non-success status.
        RuntimeError: When ``body`` is neither of the supported request types.
    """
    filename: str
    buf: BytesIO

    if isinstance(body, ConvertDocumentFileSourceRequest):
        try:
            raw = base64.b64decode(body.file_source.base64_string)
        except ValueError as err:  # binascii.Error is a ValueError subclass
            raise HTTPException(
                status_code=400,
                detail="file_source.base64_string is not valid base64 data.",
            ) from err
        buf = BytesIO(raw)
        filename = body.file_source.filename
    elif isinstance(body, ConvertDocumentHttpSourceRequest):
        # NOTE(review): the URL is caller-supplied and fetched from this host
        # (server-side request forgery risk) — confirm this service is only
        # reachable by trusted clients or add an allow-list.
        try:
            http_res = httpx.get(
                body.http_source.url, headers=body.http_source.headers
            )
            # Do not feed an error page to the converter.
            http_res.raise_for_status()
        except (httpx.HTTPError, httpx.InvalidURL) as err:
            raise HTTPException(
                status_code=400,
                detail=f"Could not download the document from http_source.url: {err}",
            ) from err
        buf = BytesIO(http_res.content)
        filename = resolve_remote_filename(
            http_url=AnyHttpUrl(body.http_source.url),
            response_headers=dict(**http_res.headers),
        )
    else:
        # Without this branch `filename`/`buf` would be unbound further down.
        raise RuntimeError(f"Unexpected request type {type(body).__name__}")

    doc_input = DocumentStream(name=filename, stream=buf)

    pipeline_options, options_hash = get_pdf_pipeline_opts(body.options)
    # One converter per distinct option set. NOTE(review): entries are never
    # evicted while the app runs — consider bounding the cache.
    if options_hash not in converters:
        converters[options_hash] = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
                InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
            }
        )

    result: ConversionResult = converters[options_hash].convert(doc_input)

    # Handle a missing result on its own: it has no `.errors` to report.
    if result is None:
        raise HTTPException(
            status_code=500, detail="The conversion did not produce a result."
        )

    # HTTPException details are rendered as JSON, so dump the error models to
    # plain data instead of passing the model instances through.
    errors = [item.model_dump(mode="json") for item in result.errors]

    if result.status == ConversionStatus.SKIPPED:
        raise HTTPException(status_code=400, detail=errors)

    if result.status != ConversionStatus.SUCCESS:
        raise HTTPException(
            status_code=500, detail={"errors": errors, "status": result.status}
        )

    return result
# NOTE(review): no route decorator is visible for this function in this view —
# confirm it is registered on `app` (path and method) in the real module.
def convert_document(
    body: ConvertDocumentRequest,
) -> ConvertDocumentResponse:
    """Convert a document and return the representations the caller asked for.

    Args:
        body: The conversion request; ``body.options`` selects the outputs.

    Returns:
        A response carrying the selected document representations, the
        conversion status and the timings of the conversion result.

    Raises:
        HTTPException: Propagated from ``_convert_document``.
    """
    result = _convert_document(body=body)
    opts = body.options
    document = result.document

    # Embed image data only when images were requested; otherwise the exports
    # contain placeholders.
    if opts.include_images:
        image_mode = ImageRefMode.EMBEDDED
    else:
        image_mode = ImageRefMode.PLACEHOLDER

    doc_resp = DocumentResponse()
    if opts.output_docling_document:
        doc_resp.docling_document = document
    if opts.output_markdown:
        doc_resp.markdown = document.export_to_markdown(image_mode=image_mode)
    if opts.output_html:
        doc_resp.html = document.export_to_html(image_mode=image_mode)

    return ConvertDocumentResponse(
        document=doc_resp, status=result.status, timings=result.timings
    )
# NOTE(review): no route decorator is visible for this function in this view —
# confirm it is registered on `app` (path, method, response class) in the real
# module.
def convert_document_md(
    body: ConvertDocumentRequest,
) -> MarkdownTextResponse:
    """Convert a document and return it as a ``text/markdown`` response.

    Args:
        body: The conversion request; only ``options.include_images`` affects
            the export (embedded images versus placeholders).

    Returns:
        The Markdown export of the converted document.

    Raises:
        HTTPException: Propagated from ``_convert_document``.
    """
    result = _convert_document(body=body)

    if body.options.include_images:
        image_mode = ImageRefMode.EMBEDDED
    else:
        image_mode = ImageRefMode.PLACEHOLDER

    markdown = result.document.export_to_markdown(image_mode=image_mode)
    return MarkdownTextResponse(markdown)