File size: 2,140 Bytes
ceaf2e8
 
 
 
 
 
 
 
c6954d9
d8475f5
 
 
ceaf2e8
 
 
 
c6954d9
 
ceaf2e8
 
 
 
 
 
 
 
 
f6cfeb8
 
ceaf2e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a4c6fc
 
 
 
 
 
 
 
 
 
ceaf2e8
 
 
 
 
 
 
 
c6954d9
ceaf2e8
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from typing import List
from smoldocling import cli
import shutil
import dotenv

# Point the model caches at fixed directories under /app/.cache.
# NOTE(review): these variables are assigned after `from smoldocling import cli`
# has already executed; if that import (or anything it pulls in) reads the
# cache variables at import time, the values set here may come too late —
# confirm, and move the assignments above the imports if so.
os.environ["TRANSFORMERS_CACHE"] = "/app/.cache/transformers"
os.environ["HF_HUB_CACHE"] = "/app/.cache/hub"

# Load environment variables (dotenv is invoked with its default arguments).
dotenv.load_dotenv()

# Initialize FastAPI app; the route decorators further down register on it.
app = FastAPI()

# Enable CORS (optional, but good for dev/testing).
# Every origin, method and header is allowed by the wildcards below.
# NOTE(review): tighten `allow_origins` before exposing this service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Ensure the working directories exist. This runs at import time;
# exist_ok=True makes it a no-op when the directories are already present.
UPLOAD_DIR = "/tmp/uploads"  # where /parse stores incoming uploads
OUTPUT_DIR = "/tmp/output"  # where docling_process_files writes JSON/HTML output
os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)


def docling_process_files(file_list: List[str]) -> str:
    """Run the docling pipeline on *file_list* and return the first file's text.

    Every path in *file_list* is handed to ``cli.process_files`` with JSON
    output written into ``OUTPUT_DIR``. The overlay HTML and the stitched text
    are produced only for the first entry of the list.

    Args:
        file_list: Paths of the documents to process; must not be empty.

    Returns:
        Whatever ``cli.stitch_text_from_json`` returns for the first file's
        JSON output (GPT fixing disabled).

    Raises:
        ValueError: If *file_list* is empty.
    """
    # Fail fast with a clear message instead of running the pipeline on
    # nothing and then hitting an IndexError on file_list[0].
    if not file_list:
        raise ValueError("file_list must contain at least one file path.")

    cli.process_files(file_list, OUTPUT_DIR, output_format='json')

    # Normalise Windows-style separators so basename() finds the real name.
    file_path = file_list[0].replace('\\', '/')
    file_name = os.path.splitext(os.path.basename(file_path))[0]

    # The JSON output is looked up as "<stem>.json" inside OUTPUT_DIR.
    json_output = os.path.join(OUTPUT_DIR, f"{file_name}.json")
    overlay_html = os.path.join(OUTPUT_DIR, f"{file_name}_overlay.html")

    # Generate overlay (optional)
    cli.generate_docling_overlay(file_path, json_output, overlay_html)

    # Stitch final cleaned text (you can toggle GPT fixing)
    cleaned_text = cli.stitch_text_from_json(json_output, gpt_fix=False)
    return cleaned_text


@app.get("/")
def root():
    """Return a fixed JSON message showing that the root route responds."""
    payload = {"message": "Root is working"}
    return JSONResponse(content=payload)


@app.get("/health")
def health_check():
    """Return a fixed JSON status body for health probes."""
    status_body = {"status": "ok"}
    return JSONResponse(content=status_body)


@app.post("/parse")
async def parse_docling(file: UploadFile = File(...)):
    """Save an uploaded document and return the text extracted from it.

    The upload is written into ``UPLOAD_DIR`` under a sanitised file name and
    then passed to ``docling_process_files``.

    Args:
        file: The uploaded document (multipart form field ``file``).

    Returns:
        ``{"text": ...}`` on success, or a 500 JSON response
        ``{"error": ...}`` when processing raises.

    Raises:
        HTTPException: 400 when no file or no usable file name is supplied.
    """
    if not file:
        raise HTTPException(status_code=400, detail="No file uploaded.")

    # The file name is client-controlled. Keep only its last path component so
    # values such as "../../etc/passwd" or "/abs/path" cannot escape
    # UPLOAD_DIR. Backslashes are treated as separators too, which also keeps
    # the saved path consistent with the separator normalisation performed in
    # docling_process_files.
    raw_name = file.filename or ""
    safe_name = os.path.basename(raw_name.replace("\\", "/"))
    if safe_name in ("", ".", "..") or "\x00" in safe_name:
        raise HTTPException(status_code=400, detail="Invalid or missing filename.")

    save_path = os.path.join(UPLOAD_DIR, safe_name)
    with open(save_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    try:
        text_output = docling_process_files([save_path])
        return JSONResponse(content={"text": text_output})
    except Exception as e:
        # Boundary handler: report any processing failure as a JSON 500
        # rather than letting it propagate.
        return JSONResponse(status_code=500, content={"error": str(e)})