Spaces:
Configuration error
Configuration error
File size: 2,536 Bytes
44657b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
import base64
from contextlib import asynccontextmanager
from io import BytesIO
from pathlib import Path
from typing import Any, Dict, Union
import httpx
from docling.datamodel.base_models import (
ConversionStatus,
DocumentStream,
PipelineOptions,
)
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from docling_serve.settings import Settings
class HttpSource(BaseModel):
    """Location of a document to be downloaded over HTTP."""

    # URL the document is fetched from.
    url: str
    # Extra request headers sent with the download; empty by default.
    # NOTE(review): mutable default -- pydantic is expected to copy field
    # defaults per instance; confirm for the pinned pydantic version.
    headers: Dict[str, Any] = {}
class FileSource(BaseModel):
    """A document supplied inline in the request body."""

    # Document bytes, base64-encoded (decoded with ``base64.b64decode``).
    base64_string: str
    # Name attached to the decoded stream when it is handed to the converter.
    filename: str
class ConvertDocumentHttpSourceRequest(BaseModel):
    """Request body variant that points at a remotely hosted document."""

    # Where to download the document from.
    http_source: HttpSource
class ConvertDocumentFileSourceRequest(BaseModel):
    """Request body variant that carries the document inline."""

    # Base64-encoded document together with its filename.
    file_source: FileSource
class ConvertDocumentResponse(BaseModel):
    """Response body of the ``/convert`` endpoint."""

    # Converted document rendered as Markdown.
    content_md: str
# Body accepted by ``/convert``: either an inline file or an HTTP source.
# NOTE(review): member order may influence which model pydantic tries first;
# keep it stable unless that behaviour has been checked.
ConvertDocumentRequest = Union[
    ConvertDocumentFileSourceRequest, ConvertDocumentHttpSourceRequest
]
# Process-wide registry of shared objects; ``lifespan`` stores the document
# converter under the "converter" key and empties the dict on exit.
models = {}
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared document converter on entry and drop it on exit.

    The converter is configured from ``Settings`` (OCR and table-structure
    flags) and published under ``models["converter"]``. After the ``yield``
    the ``models`` registry is emptied.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    config = Settings()
    options = PipelineOptions()
    # Mirror the relevant feature flags from the settings onto the pipeline.
    for flag in ("do_ocr", "do_table_structure"):
        setattr(options, flag, getattr(config, flag))

    converter = DocumentConverter(pipeline_options=options)
    models["converter"] = converter

    yield

    models.clear()
# ASGI application object; startup/shutdown handling is delegated to `lifespan`.
app = FastAPI(lifespan=lifespan, title="Docling Serve")
@app.post("/convert")
def convert_pdf_document(
    body: ConvertDocumentRequest,
) -> ConvertDocumentResponse:
    """Convert an inline or remotely hosted document to Markdown.

    Args:
        body: Either a ``file_source`` request carrying the base64-encoded
            document and its filename, or an ``http_source`` request carrying
            the URL (and optional request headers) to download it from.

    Returns:
        The converted document rendered as Markdown.

    Raises:
        HTTPException: 400 if the base64 payload cannot be decoded, the URL
            is invalid, or the body matches neither source type; 502 if the
            remote download fails or answers with an error status; 500 if
            the converter yields no result or the conversion is not
            successful.
    """
    # Local import: only needed to derive a filename from the source URL.
    from urllib.parse import urlsplit

    filename: str
    buf: BytesIO

    if isinstance(body, ConvertDocumentFileSourceRequest):
        try:
            raw = base64.b64decode(body.file_source.base64_string)
        except ValueError as exc:  # binascii.Error is a ValueError subclass
            raise HTTPException(
                status_code=400,
                detail="file_source.base64_string is not valid base64",
            ) from exc
        buf = BytesIO(raw)
        filename = body.file_source.filename
    elif isinstance(body, ConvertDocumentHttpSourceRequest):
        url = body.http_source.url
        # NOTE(review): URL and headers are caller-supplied, so this endpoint
        # can be used to make the server issue arbitrary requests (SSRF).
        # Restrict reachable hosts at deployment level if that matters.
        try:
            http_res = httpx.get(url, headers=body.http_source.headers)
            # Do not feed an error page to the converter.
            http_res.raise_for_status()
        except httpx.InvalidURL as exc:
            raise HTTPException(
                status_code=400, detail=f"Invalid URL {url!r}: {exc}"
            ) from exc
        except httpx.HTTPError as exc:
            raise HTTPException(
                status_code=502, detail=f"Failed to fetch {url!r}: {exc}"
            ) from exc
        buf = BytesIO(http_res.content)
        # Use only the URL path so query strings/fragments do not end up in
        # the filename; fall back to the raw URL's last component otherwise.
        # TODO: use better way to detect filename, e.g. from Content-Disposition
        filename = Path(urlsplit(url).path).name or Path(url).name
    else:
        # Request validation should make this unreachable; fail explicitly
        # rather than hitting unbound locals below.
        raise HTTPException(status_code=400, detail="Unsupported request body")

    docs_input = DocumentConversionInput.from_streams(
        [DocumentStream(filename=filename, stream=buf)]
    )
    result: Union[ConversionResult, None] = next(
        models["converter"].convert(docs_input), None
    )
    if result is None:
        # The converter produced nothing; there is no result to read errors from.
        raise HTTPException(
            status_code=500,
            detail={"errors": ["Conversion produced no result"]},
        )
    if result.status != ConversionStatus.SUCCESS:
        raise HTTPException(status_code=500, detail={"errors": result.errors})
    return ConvertDocumentResponse(content_md=result.render_as_markdown())
|