# Web-viewer residue (not part of the module): dolfim-ibm's picture | docling_serve | 44657b5 | raw | history blame | 2.54 kB
import base64
from contextlib import asynccontextmanager
from io import BytesIO
from pathlib import Path
from typing import Any, Dict, Union
import httpx
from docling.datamodel.base_models import (
ConversionStatus,
DocumentStream,
PipelineOptions,
)
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from docling_serve.settings import Settings
class HttpSource(BaseModel):
    """Location of a remote document to download before conversion."""

    # Address the document is fetched from with a GET request.
    url: str
    # Extra request headers forwarded with the download request.
    headers: Dict[str, Any] = {}
class FileSource(BaseModel):
    """A document supplied inline in the request body."""

    # Document bytes, base64-encoded.
    base64_string: str
    # Name attached to the decoded bytes when they are handed to the converter.
    filename: str
class ConvertDocumentHttpSourceRequest(BaseModel):
    """Conversion request whose document is downloaded from a URL."""

    http_source: HttpSource
class ConvertDocumentFileSourceRequest(BaseModel):
    """Conversion request whose document is embedded as base64 in the body."""

    file_source: FileSource
class ConvertDocumentResponse(BaseModel):
    """Payload returned by a successful conversion."""

    # Markdown rendering of the converted document.
    content_md: str
# Body accepted by the conversion endpoint: either an inline base64 file
# or a URL to download.
ConvertDocumentRequest = Union[
    ConvertDocumentFileSourceRequest,
    ConvertDocumentHttpSourceRequest,
]

# Module-level registry of shared objects; `lifespan` stores the document
# converter here under the "converter" key and empties it on shutdown.
models: Dict[str, Any] = {}
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared document converter on startup and drop it on shutdown.

    Reads ``do_ocr`` and ``do_table_structure`` from ``Settings``, copies them
    onto a fresh ``PipelineOptions`` instance, and stores the resulting
    ``DocumentConverter`` under ``models["converter"]``.

    Args:
        app: The FastAPI application this lifespan is attached to. It is not
            used inside the function.

    Yields:
        None. Control is handed back to the application while it serves
        requests.
    """
    # Converter
    settings = Settings()
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = settings.do_ocr
    pipeline_options.do_table_structure = settings.do_table_structure

    models["converter"] = DocumentConverter(pipeline_options=pipeline_options)
    try:
        yield
    finally:
        # Release the converter even if an exception is thrown into the
        # generator at the yield point; without the finally, cleanup would be
        # skipped in that case.
        models.clear()
# Application object; startup and shutdown work is delegated to `lifespan`.
app = FastAPI(title="Docling Serve", lifespan=lifespan)
@app.post("/convert")
def convert_pdf_document(
    body: ConvertDocumentRequest,
) -> ConvertDocumentResponse:
    """Convert a document to Markdown.

    The document is taken either from an inline base64 payload or downloaded
    from a URL, wrapped in a ``DocumentStream`` and passed to the converter
    stored in ``models["converter"]``.

    Args:
        body: Either a ``ConvertDocumentFileSourceRequest`` (inline base64
            file) or a ``ConvertDocumentHttpSourceRequest`` (URL to fetch).

    Returns:
        A ``ConvertDocumentResponse`` whose ``content_md`` holds the Markdown
        rendering of the converted document.

    Raises:
        HTTPException: 400 if the base64 payload cannot be decoded; 502 if the
            remote document cannot be downloaded or the server answers with an
            error status; 500 if the converter yields no result or reports a
            non-success status.
    """
    filename: str
    buf: BytesIO

    if isinstance(body, ConvertDocumentFileSourceRequest):
        try:
            raw_bytes = base64.b64decode(body.file_source.base64_string)
        except ValueError as exc:
            # binascii.Error (bad padding etc.) is a ValueError subclass;
            # report it as a client error instead of an unhandled 500.
            raise HTTPException(
                status_code=400,
                detail={"errors": [f"Invalid base64 payload: {exc}"]},
            ) from exc
        buf = BytesIO(raw_bytes)
        filename = body.file_source.filename
    elif isinstance(body, ConvertDocumentHttpSourceRequest):
        source_url = body.http_source.url
        try:
            http_res = httpx.get(source_url, headers=body.http_source.headers)
            # Without this check a 4xx/5xx error page would be handed to the
            # converter as if it were the requested document.
            http_res.raise_for_status()
        except httpx.HTTPError as exc:
            raise HTTPException(
                status_code=502,
                detail={"errors": [f"Failed to download document: {exc}"]},
            ) from exc
        buf = BytesIO(http_res.content)
        filename = Path(
            source_url
        ).name  # TODO: use better way to detect filename, e.g. from Content-Disposition

    docs_input = DocumentConversionInput.from_streams(
        [DocumentStream(filename=filename, stream=buf)]
    )

    # The converter returns an iterator; only the first result is consumed
    # because exactly one stream was submitted.
    result: Union[ConversionResult, None] = next(
        models["converter"].convert(docs_input), None
    )

    if result is None:
        # Handled separately: there is no result object to read errors from.
        raise HTTPException(
            status_code=500,
            detail={"errors": ["Conversion produced no result."]},
        )
    if result.status != ConversionStatus.SUCCESS:
        raise HTTPException(status_code=500, detail={"errors": result.errors})

    return ConvertDocumentResponse(content_md=result.render_as_markdown())