Spaces:
Sleeping
Sleeping
Added the Smoldocling package and implemented its first test endpoint, /parse
Browse files- Dockerfile +10 -10
- app.py +57 -4
- requirements.txt +2 -0
- smoldocling/README.md +54 -0
- smoldocling/__init__.py +5 -0
- smoldocling/cli.py +530 -0
- smoldocling/overlays.py +305 -0
- smoldocling/pyproject.toml +28 -0
- smoldocling/server.py +20 -0
- smoldocling/test_server.py +31 -0
- smoldocling/testrun.py +20 -0
Dockerfile
CHANGED
@@ -1,16 +1,16 @@
|
|
1 |
-
#
|
2 |
-
# you will also find guides on how best to write your Dockerfile
|
3 |
|
4 |
-
FROM python:3.
|
5 |
-
|
6 |
-
RUN useradd -m -u 1000 user
|
7 |
-
USER user
|
8 |
-
ENV PATH="/home/user/.local/bin:$PATH"
|
9 |
|
10 |
WORKDIR /app
|
11 |
|
12 |
-
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
-
COPY --chown=user . /app
|
16 |
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
1 |
+
# Dockerfile

FROM python:3.10-slim

# pdf2image (used by smoldocling/cli.py) shells out to the Poppler command-line
# tools, which the slim base image does not include; without them every PDF
# conversion fails at runtime.
RUN apt-get update && \
    apt-get install -y --no-install-recommends poppler-utils && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy everything including smoldocling
COPY . .

# Install requirements (editable install of local package)
RUN pip install --no-cache-dir -r requirements.txt && \
    pip install --no-cache-dir -e ./smoldocling

EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
app.py
CHANGED
@@ -1,8 +1,61 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
|
|
|
|
|
|
|
|
3 |
app = FastAPI()
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
-
|
7 |
-
|
8 |
-
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from fastapi import FastAPI, UploadFile, File, HTTPException
|
3 |
+
from fastapi.responses import JSONResponse
|
4 |
+
from fastapi.middleware.cors import CORSMiddleware
|
5 |
+
from typing import List
|
6 |
+
from smoldocling import cli
|
7 |
+
import shutil
|
8 |
+
import dotenv
|
9 |
|
10 |
+
# Load environment variables
dotenv.load_dotenv()

# Initialize FastAPI app
app = FastAPI()

# Enable CORS (optional, but good for dev/testing)
# NOTE(review): origins, methods and headers are all wildcards, so any web page
# can call this API from a browser — confirm that is intended before exposing
# the service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Ensure directories exist
# Both paths are relative, so they resolve against the process's working
# directory at import time.
UPLOAD_DIR = "uploads"
OUTPUT_DIR = "output"
os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
29 |
+
|
30 |
+
|
31 |
+
def docling_process_files(file_list: List[str]) -> str:
    """Run the Smoldocling pipeline and return the stitched text of the first file.

    Every path in ``file_list`` is handed to ``cli.process_files`` with JSON
    output written to ``OUTPUT_DIR``. Only the first entry is then used to
    build the overlay HTML and to produce the returned text.

    Args:
        file_list: Paths of the documents to process; must be non-empty.

    Returns:
        The text stitched from the first file's JSON output (GPT fixing off).
    """
    cli.process_files(file_list, OUTPUT_DIR, output_format='json')

    # Normalise Windows separators so basename() splits the path correctly.
    first_input = file_list[0].replace('\\', '/')
    stem, _extension = os.path.splitext(os.path.basename(first_input))

    json_output = os.path.join(OUTPUT_DIR, f"{stem}.json")
    overlay_html = os.path.join(OUTPUT_DIR, f"{stem}_overlay.html")

    # Generate overlay (optional)
    cli.generate_docling_overlay(first_input, json_output, overlay_html)

    # Stitch final cleaned text (you can toggle GPT fixing)
    return cli.stitch_text_from_json(json_output, gpt_fix=False)
|
46 |
+
|
47 |
+
|
48 |
+
@app.post("/parse")
async def parse_docling(file: UploadFile = File(...)):
    """Store an uploaded document, run it through Smoldocling and return its text.

    Args:
        file: The uploaded document (multipart form field ``file``).

    Returns:
        ``{"text": ...}`` on success, or a 500 response ``{"error": ...}`` when
        saving or processing raises.

    Raises:
        HTTPException: 400 when no file is supplied or its name is unusable.
    """
    if not file:
        raise HTTPException(status_code=400, detail="No file uploaded.")

    # The filename comes from the client. Keep only its last path component so
    # a name such as "../../app.py" cannot write outside UPLOAD_DIR.
    safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise HTTPException(status_code=400, detail="Invalid file name.")

    save_path = os.path.join(UPLOAD_DIR, safe_name)

    try:
        # Saving is inside the handler so disk errors also produce the JSON
        # error payload instead of an unhandled exception.
        with open(save_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        text_output = docling_process_files([save_path])
        return JSONResponse(content={"text": text_output})
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
|
requirements.txt
CHANGED
@@ -1,2 +1,4 @@
|
|
1 |
fastapi
|
2 |
uvicorn[standard]
|
|
|
|
|
|
1 |
fastapi
|
2 |
uvicorn[standard]
|
3 |
+
python-multipart
|
4 |
+
-e ./smoldocling
|
smoldocling/README.md
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Smoldocling CLI
|
2 |
+
|
3 |
+
A command-line interface for processing document images and PDFs using the SmolDocling-256M-preview model.
|
4 |
+
|
5 |
+
## Installation
|
6 |
+
|
7 |
+
1. Clone this repository
|
8 |
+
2. Install the required dependencies:
|
9 |
+
```bash
|
10 |
+
pip install -r requirements.txt
|
11 |
+
```
|
12 |
+
|
13 |
+
## Usage
|
14 |
+
|
15 |
+
The CLI supports processing one or multiple document images and PDFs at once. The processed output will be saved as HTML files.
|
16 |
+
|
17 |
+
Basic usage:
|
18 |
+
```bash
|
19 |
+
python smoldocling_cli.py input1.png input2.jpg input3.pdf
|
20 |
+
```
|
21 |
+
|
22 |
+
Specify output directory:
|
23 |
+
```bash
|
24 |
+
python smoldocling_cli.py -o custom_output input1.png document.pdf
|
25 |
+
```
|
26 |
+
|
27 |
+
### Arguments
|
28 |
+
|
29 |
+
- `input_files`: One or more input files (images or PDFs) to process
|
30 |
+
- `-o, --output-dir`: Output directory for HTML files (default: 'output')
|
31 |
+
|
32 |
+
### Example
|
33 |
+
|
34 |
+
```bash
|
35 |
+
python smoldocling_cli.py document1.png document2.pdf -o processed_docs
|
36 |
+
```
|
37 |
+
|
38 |
+
This will:
|
39 |
+
1. Process document1.png and generate document1.html
|
40 |
+
2. Process document2.pdf and generate document2.html (containing all pages in a single file)
|
41 |
+
3. Create a directory called 'processed_docs' if it doesn't exist
|
42 |
+
4. Save all HTML files in the processed_docs directory
|
43 |
+
|
44 |
+
## Notes
|
45 |
+
|
46 |
+
- The script will automatically create the output directory if it doesn't exist
|
47 |
+
- Each input image file will generate a corresponding HTML file with the same name (but .html extension)
|
48 |
+
- PDF files will generate a single HTML file containing all pages
|
49 |
+
- Currently, PDF processing is limited to the first 3 pages due to model limitations
|
50 |
+
- Failed processing of one file won't stop the processing of other files
|
51 |
+
- Error messages will be printed to stderr
|
52 |
+
- The model is loaded only once for processing multiple files
|
53 |
+
|
54 |
+
|
smoldocling/__init__.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Smoldocling CLI package for processing document images.
|
3 |
+
"""
|
4 |
+
|
5 |
+
__version__ = "0.1.0"
|
smoldocling/cli.py
ADDED
@@ -0,0 +1,530 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
import argparse
|
4 |
+
import os
|
5 |
+
from pathlib import Path
|
6 |
+
from PIL import Image, ImageDraw
|
7 |
+
from docling_core.types.doc import DoclingDocument, ImageRefMode
|
8 |
+
from docling_core.types.doc.document import DocTagsDocument
|
9 |
+
import torch
|
10 |
+
from transformers import AutoProcessor, AutoModelForVision2Seq
|
11 |
+
from transformers.image_utils import load_image
|
12 |
+
import sys
|
13 |
+
from pdf2image import convert_from_path
|
14 |
+
import tempfile
|
15 |
+
import json
|
16 |
+
import matplotlib.pyplot as plt
|
17 |
+
from pprint import pprint
|
18 |
+
import base64
|
19 |
+
from dotenv import load_dotenv
|
20 |
+
import openai
|
21 |
+
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
22 |
+
from azure.core.credentials import AzureKeyCredential
|
23 |
+
from smoldocling.overlays import generate_azure_overlay_html, generate_docling_overlay
|
24 |
+
from PIL import Image
|
25 |
+
import requests
|
26 |
+
from io import BytesIO
|
27 |
+
|
28 |
+
# Run inference on the GPU when torch reports one, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load a local .env file (if present) into the environment; functions in this
# module read OPENAI_API_KEY and the AZURE_DOCUMENT_INTELLIGENCE_* variables
# through os.environ.
load_dotenv()
|
31 |
+
|
32 |
+
|
33 |
+
def load_model(verbose=True):
    """Load the Smoldocling model and return model and processor.

    Args:
        verbose: Print a progress message before loading.

    Returns:
        A ``(model, processor)`` tuple, with the model moved to ``DEVICE``.
    """
    if verbose:
        print("Loading Smoldocling model...")
    model_path = "ds4sd/SmolDocling-256M-preview"
    processor = AutoProcessor.from_pretrained(model_path)
    # float16 is intended for GPU inference (e.g. a T4). On CPU, half precision
    # is poorly supported, so use float32 when no GPU is available.
    dtype = torch.float16 if DEVICE == "cuda" else torch.float32
    model = AutoModelForVision2Seq.from_pretrained(
        model_path,
        torch_dtype=dtype,
    ).to(DEVICE)
    return model, processor
|
44 |
+
|
45 |
+
|
46 |
+
def run_model(model, processor, image, prompt="Convert this page to docling.", verbose=True):
    """Run the Smoldocling model on one image and return the decoded doctags.

    Args:
        model: Model object exposing ``generate``.
        processor: Processor exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        image: The page image to convert.
        prompt: Instruction text sent alongside the image.
        verbose: Print a progress message before generation.

    Returns:
        The decoded continuation (special tokens kept), left-stripped.
    """
    # Single user turn: one image placeholder followed by the instruction.
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": prompt},
        ],
    }
    chat_text = processor.apply_chat_template([user_turn], add_generation_prompt=True)
    model_inputs = processor(
        text=chat_text,
        images=[image],
        return_tensors="pt",
        truncation=True,  # Avoid truncation warning
    )
    model_inputs = model_inputs.to(DEVICE)

    # Generate output
    if verbose:
        print("Generating text...")
    output_ids = model.generate(**model_inputs, max_new_tokens=8192)

    # Skip the positions occupied by the prompt so only the tokens after it
    # are decoded.
    prompt_token_count = model_inputs.input_ids.shape[1]
    continuation_ids = output_ids[:, prompt_token_count:]
    decoded = processor.batch_decode(continuation_ids, skip_special_tokens=False)
    return decoded[0].lstrip()
|
73 |
+
|
74 |
+
|
75 |
+
def extract_text_from_document(image_path, model, processor, output_format="html", verbose=True):
    """Convert a single document image with Smoldocling.

    Args:
        image_path: Path of the image to process.
        model: Loaded model, passed through to ``run_model``.
        processor: Loaded processor, passed through to ``run_model``.
        output_format: ``"json"`` for a dict export; anything else yields HTML.
        verbose: Print progress and error messages.

    Returns:
        A dict (JSON mode) or an HTML string, or ``None`` when any step raised.
    """
    try:
        # Load and preprocess the image
        page_image = Image.open(image_path)

        if verbose:
            print(f"Processing {image_path}")
            print(f"Image mode: {page_image.mode}")
            print(f"Image size: {page_image.size}")

        # Run docling vlm
        doctags = run_model(model, processor, page_image, verbose=verbose)
        doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
            [doctags],
            [page_image]
        )
        doc = DoclingDocument(name=Path(image_path).stem).load_from_doctags(doctags_doc)

        # Handle formatting and export
        if output_format != "json":
            return doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)

        # Export to dict and drop the embedded picture payloads.
        exported = doc.export_to_dict()
        if "pictures" in exported:
            for picture in exported["pictures"]:
                if "image" in picture and "uri" in picture["image"]:
                    del picture["image"]["uri"]
        return exported

    except Exception as e:
        if verbose:
            print(f"Error processing 1: {image_path}: {str(e)}", file=sys.stderr)
        return None
|
113 |
+
|
114 |
+
|
115 |
+
def _strip_page_images(doc_dict):
    """Set the ``image`` entry of every exported page to ``None``, in place."""
    pages = doc_dict.get("pages")
    if not pages:
        return
    # "pages" may be a mapping keyed by page number rather than a list.
    # Iterating a mapping directly yields only its keys, so the page records
    # would never be touched; walk the values in that case.
    page_records = pages.values() if isinstance(pages, dict) else pages
    for page in page_records:
        if isinstance(page, dict) and "image" in page:
            page["image"] = None


def process_pdf(pdf_path, model, processor, output_dir, output_format="html", debug=False, verbose=True):
    """Process a PDF file by converting it to images and processing each page.

    Pages that fail are reported and skipped; the remaining pages are combined
    into one document.

    Args:
        pdf_path: Path of the PDF to process.
        model: Loaded model, passed through to ``run_model``.
        processor: Loaded processor, passed through to ``run_model``.
        output_dir: Directory for the combined output, or ``None`` to return
            the result instead of writing it.
        output_format: ``"json"`` for a dict/JSON export; anything else is HTML.
        debug: When true and ``output_dir`` is set, also dump one JSON file per
            page named ``<pdf stem>_p<page>.json``.
        verbose: Print progress and error messages.

    Returns:
        The exported dict or HTML string when ``output_dir`` is ``None``;
        otherwise ``None``. ``None`` is also returned when the PDF has no
        pages, no page could be processed, or an error occurred.
    """
    try:
        if verbose:
            print(f"\nProcessing PDF: {pdf_path}")
        # Convert PDF to images
        with tempfile.TemporaryDirectory() as temp_dir:
            if verbose:
                print("Converting PDF to images...")
            # TODO: Review this. It's not working when the PDF is large.
            images = convert_from_path(
                pdf_path,
                output_folder=temp_dir,
                first_page=1,
                fmt="png"
            )
            if not images:
                if verbose:
                    print(f"No pages found in PDF: {pdf_path}", file=sys.stderr)
                return
            all_doctags = []
            all_images = []
            for i, image in enumerate(images, start=1):
                image_path = os.path.join(temp_dir, f"page_{i}.png")
                image.save(image_path, "PNG")
                if verbose:
                    print(f"\nProcessing page {i}")
                try:
                    image = Image.open(image_path)
                    if verbose:
                        print(f"Processing {image_path}")
                        print(f"Image mode: {image.mode}")
                        print(f"Image size: {image.size}")
                    output = run_model(model, processor, image, verbose=verbose)
                    cleaned_output = output.replace("<end_of_utterance>", "").strip()
                    # If you have charts:
                    if "<chart>" in cleaned_output:
                        cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
                    all_doctags.append(cleaned_output)
                    all_images.append(image)
                    if verbose:
                        print(f"Successfully processed page {i}")
                    # DEBUG: Dump per-page JSON if requested
                    if debug and output_dir is not None:
                        # Create a single-page DocTagsDocument and DoclingDocument
                        doctags_doc_page = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], [image])
                        doc_page = DoclingDocument(name=f"{Path(pdf_path).stem}_p{i}")
                        doc_page.load_from_doctags(doctags_doc_page)
                        doc_dict_page = doc_page.export_to_dict()
                        # Remove images from the dict if present
                        _strip_page_images(doc_dict_page)
                        page_json_path = Path(output_dir) / f"{Path(pdf_path).stem}_p{i}.json"
                        with open(page_json_path, 'w', encoding='utf-8') as f:
                            json.dump(doc_dict_page, f, ensure_ascii=False, indent=2)
                        if verbose:
                            print(f"[DEBUG] Dumped page {i} JSON to {page_json_path}")
                except Exception as e:
                    # One bad page must not abort the rest of the document.
                    if verbose:
                        print(f"Error processing page {i}: {str(e)}", file=sys.stderr)
            if all_doctags and all_images:
                doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
                    all_doctags,
                    all_images
                )
                doc = DoclingDocument(name=Path(pdf_path).stem)
                doc.load_from_doctags(doctags_doc)
                if output_format == "json":
                    doc_dict = doc.export_to_dict()
                    _strip_page_images(doc_dict)
                    if output_dir is None:
                        return doc_dict
                    output_filename = f"{Path(pdf_path).stem}.json"
                    output_path = Path(output_dir) / output_filename
                    with open(output_path, 'w', encoding='utf-8') as f:
                        json.dump(doc_dict, f, ensure_ascii=False, indent=2)
                    if verbose:
                        print(f"\nSuccessfully saved combined output to {output_path}")
                else:
                    html_output = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
                    if output_dir is None:
                        return html_output
                    output_filename = f"{Path(pdf_path).stem}.html"
                    output_path = Path(output_dir) / output_filename
                    with open(output_path, 'w', encoding='utf-8') as f:
                        f.write(html_output)
                    if verbose:
                        print(f"\nSuccessfully saved combined output to {output_path}")
            else:
                if verbose:
                    print("No pages were successfully processed", file=sys.stderr)
    except Exception as e:
        if verbose:
            print(f"Error processing PDF {pdf_path}: {str(e)}", file=sys.stderr)
|
214 |
+
|
215 |
+
|
216 |
+
def process_files(input_files, output_dir, output_format="html", debug=False, verbose=True):
    """Process multiple input files and generate outputs in the specified format.

    The model is loaded once and reused for every file. A failure on one file
    is reported (when ``verbose``) and does not stop the remaining files.

    Args:
        input_files: Paths of images and/or PDFs to process.
        output_dir: Directory to write results into (created if missing), or
            ``None`` to collect the results and return them.
        output_format: ``"html"`` or ``"json"``.
        debug: Passed to ``process_pdf``.
        verbose: Print progress and error messages.

    Returns:
        A list with one entry per successfully processed file when
        ``output_dir`` is ``None``; otherwise ``None``.
    """
    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)

    model, processor = load_model(verbose=verbose)
    results = []

    for input_file in input_files:
        try:
            input_path = Path(input_file)
            if input_path.suffix.lower() == '.pdf':
                if output_dir is None:
                    # Collect results instead of writing to files
                    pdf_result = process_pdf(input_file, model, processor, None, output_format=output_format, debug=debug, verbose=verbose)
                    if pdf_result:
                        # process_pdf returns one dict or one HTML string.
                        # extend() would add its keys or characters one by
                        # one, so append the whole result instead.
                        results.append(pdf_result)
                else:
                    process_pdf(input_file, model, processor, output_dir, output_format=output_format, debug=debug, verbose=verbose)
            else:
                if verbose:
                    print(f"\nProcessing: {input_file}")

                doc_dict = extract_text_from_document(input_path, model, processor, output_format=output_format, verbose=verbose)
                if doc_dict:
                    if output_dir is None:
                        results.append(doc_dict)
                    else:
                        output_path = Path(output_dir) / f"{input_path.stem}.{output_format}"
                        if verbose:
                            print(f"Output will be saved to: {output_path}")
                        with open(output_path, 'w', encoding='utf-8') as f:
                            if output_format == "json":
                                json.dump(doc_dict, f, ensure_ascii=False, indent=2)
                            elif output_format == "html":
                                f.write(doc_dict)
                        if verbose:
                            print(f"Successfully processed {input_file}")
                else:
                    if verbose:
                        print(f"Failed to process {input_file}", file=sys.stderr)

        except Exception as e:
            if verbose:
                print(f"Error processing 2 {input_file}: {str(e)}", file=sys.stderr)

    if output_dir is None:
        return results
|
264 |
+
|
265 |
+
|
266 |
+
def visualize_doc(doc_path, page_num=0):
    """
    Draw the bounding boxes from a document's JSON annotation and display them.

    The JSON path is derived from ``doc_path`` by replacing "input" with
    "output" and ".png" with ".json"; for PDFs, ".pdf" becomes
    "_p<page_num+1>.json". For PDFs, text and picture boxes are drawn only when
    their provenance has ``page_no == 1``; table boxes are never filtered.

    Args:
        doc_path (str): Path to the input document file (PDF or image)
        page_num (int): Page number to visualize for PDFs (default 0)
    """
    is_pdf = doc_path.lower().endswith('.pdf')

    # Load document
    if is_pdf:
        image = convert_from_path(doc_path, first_page=1)[page_num]
    else:
        image = Image.open(doc_path).convert("RGB")

    # Load corresponding JSON
    json_path = doc_path.replace("input", "output").replace(".png", ".json")
    if is_pdf:
        # For PDFs, append page number to JSON filename
        json_path = json_path.replace(".pdf", f"_p{page_num+1}.json")
    with open(json_path, "r") as f:
        doc = json.load(f)

    # (box, label) pairs in drawing order: texts, pictures, then tables.
    annotated_boxes = []

    def collect(items, default_label, first_page_only):
        for item in items:
            for prov in item.get("prov", []):
                # currently only works for first page
                if first_page_only and prov.get("page_no") != 1:
                    continue
                bbox = prov.get("bbox")
                if bbox:
                    corners = [bbox["l"], bbox["t"], bbox["r"], bbox["b"]]
                    annotated_boxes.append((corners, item.get("label", default_label)))

    collect(doc.get("texts", []), "", is_pdf)
    collect(doc.get("pictures", []), "picture", is_pdf)
    collect(doc.get("tables", []), "", False)

    # Draw bounding boxes
    draw = ImageDraw.Draw(image)
    for (l, t, r, b), label in annotated_boxes:
        draw.rectangle([l, t, r, b], outline="red", width=2)
        if label:
            draw.text((l, t-10), f"{label} ({l:.1f}, {t:.1f}, {r:.1f}, {b:.1f})", fill="red")

    # Display
    plt.figure(figsize=(10, 12))
    plt.imshow(image)
    plt.axis("off")
    plt.show()
|
342 |
+
|
343 |
+
|
344 |
+
def stitch_text_from_json(json_path, gpt_fix=False):
    """
    Stitch the text fragments of a DoclingDocument-style JSON file into plain text.

    Fragments are collected by walking ``body.children`` and following
    ``$ref`` pointers into ``texts`` and, recursively, ``groups``. Empty texts,
    unknown refs and children without a ``$ref`` are skipped. The result is
    printed and returned.

    Args:
        json_path: Path of the JSON file to read.
        gpt_fix: When true, send the stitched text to GPT to fix line breaks
            and hyphenation. If OPENAI_API_KEY is unset or the request fails,
            the unmodified stitched text is printed and returned.

    Returns:
        The stitched (and optionally cleaned) text as a string.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        doc = json.load(f)

    # "or" also covers keys that are present but null.
    texts = doc.get('texts') or []
    groups = doc.get('groups') or []
    body = doc.get('body') or {}

    # Build lookup tables
    texts_by_ref = {f"#/texts/{i}": t for i, t in enumerate(texts)}
    # Use a positional ref when a group has no self_ref (the same scheme used
    # for texts) rather than raising KeyError.
    groups_by_ref = {
        g.get('self_ref') or f"#/groups/{i}": g for i, g in enumerate(groups)
    }

    def extract_texts(children, open_groups=frozenset()):
        # open_groups holds the refs currently being expanded, so a group that
        # (directly or indirectly) contains itself is skipped instead of
        # recursing without end.
        result = []
        for child in children:
            ref = child.get('$ref')
            if ref is None:
                continue
            if ref.startswith('#/texts/'):
                text_obj = texts_by_ref.get(ref)
                if text_obj:
                    text = text_obj.get('text', '')
                    if text:
                        result.append(text)
            elif ref.startswith('#/groups/'):
                group_obj = groups_by_ref.get(ref)
                if group_obj and ref not in open_groups:
                    result.extend(
                        extract_texts(group_obj.get('children') or [], open_groups | {ref})
                    )
        return result

    final_text = '\n'.join(extract_texts(body.get('children') or []))

    if not gpt_fix:
        print(final_text)
        return final_text

    try:
        api_key = os.environ.get('OPENAI_API_KEY')
        if not api_key:
            print("OPENAI_API_KEY not set. Printing original stitched text.", file=sys.stderr)
            print(final_text)
            return final_text
        client = openai.OpenAI(api_key=api_key)
        prompt = (
            "You are a helpful assistant. "
            "The following text was extracted from a document and may contain odd line breaks, hyphenated words split across lines, or other OCR artifacts. "
            "Please rewrite the text as clean, readable prose, fixing line breaks, joining hyphenated words, and correcting obvious errors, but do not add or remove content.\n\n"
            f"Text to fix:\n\n{final_text}\n\nCleaned text:"
        )
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=4096,
            temperature=0.0,
        )
        cleaned_text = response.choices[0].message.content.strip()
        print(cleaned_text)
        return cleaned_text
    except Exception as e:
        # Best effort: any GPT failure falls back to the raw stitched text.
        print(f"[GPT-fix error] {e}. Printing original stitched text.", file=sys.stderr)
        print(final_text)
        return final_text
|
413 |
+
|
414 |
+
|
415 |
+
def extract_with_azure(input_files, output_dir, output_format="json", verbose=True):
    """Analyze files with Azure Document Intelligence and save the raw results.

    Each file is sent base64-encoded to the "prebuilt-layout" model and the
    result is written as ``<input stem>.json`` inside ``output_dir``. Nothing
    is processed when the endpoint or key environment variable is missing.

    Args:
        input_files: Paths of the files to analyze.
        output_dir: Directory for the JSON results (created if missing).
        output_format: Accepted for CLI symmetry; not read by this function.
        verbose: Print the path of each saved result.
    """
    endpoint = os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
    key = os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_KEY")
    if not (endpoint and key):
        print("Azure endpoint/key not set. Set AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and AZURE_DOCUMENT_INTELLIGENCE_KEY in your environment.", file=sys.stderr)
        return

    client = DocumentIntelligenceClient(endpoint, AzureKeyCredential(key))
    os.makedirs(output_dir, exist_ok=True)

    for input_file in input_files:
        with open(input_file, "rb") as source:
            encoded_source = base64.b64encode(source.read()).decode("utf-8")
        poller = client.begin_analyze_document(
            model_id="prebuilt-layout",
            body={"base64Source": encoded_source}
        )
        analysis = poller.result()
        output_path = Path(output_dir) / (Path(input_file).stem + ".json")
        with open(output_path, "w", encoding="utf-8") as out_f:
            json.dump(analysis.as_dict(), out_f, ensure_ascii=False, indent=2)
        if verbose:
            print(f"Azure baseline output saved to {output_path}")
|
438 |
+
|
439 |
+
|
440 |
+
def main():
    """Command-line entry point: parse arguments and dispatch to a sub-command.

    Sub-commands: ``process``, ``overlay-html``, ``stitch-text``,
    ``azure-baseline`` and ``azure-overlay-html``. Running without a
    sub-command prints a usage error and exits with status 2.
    """
    parser = argparse.ArgumentParser(
        description="Process document images and PDFs using Smoldocling and generate HTML or JSON outputs"
    )
    subparsers = parser.add_subparsers(dest="command", required=False)

    # Default parser for main processing
    parser_main = subparsers.add_parser("process", help="Process images or PDFs to HTML/JSON (default)")
    parser_main.add_argument(
        'input_files', nargs='+', help='One or more input files (images or PDFs) to process'
    )
    parser_main.add_argument(
        '-o', '--output-dir', default='output', help='Output directory for result files (default: output)'
    )
    parser_main.add_argument(
        '--format', choices=['html', 'json'], default='html', help='Output format: html or json (default: html)'
    )
    parser_main.add_argument(
        '--debug', action='store_true', help='Enable debug mode: dump each PDF page as a separate JSON file.'
    )

    # Overlay HTML subcommand
    parser_overlay = subparsers.add_parser("overlay-html", help="Generate HTML overlay from PNG and JSON")
    parser_overlay.add_argument('image_file', help='Source PNG image file')
    parser_overlay.add_argument('json_file', help='Extracted JSON file with bounding boxes')
    parser_overlay.add_argument('-o', '--output', help='Output HTML file (default: <image_file>_overlay.html)')

    # Stitch text subcommand
    parser_stitch = subparsers.add_parser("stitch-text", help="Stitch together text fragments from a JSON file and print as plain text")
    parser_stitch.add_argument('json_file', help='Extracted JSON file to stitch')
    parser_stitch.add_argument('--gpt-fix', action='store_true', help='Send stitched text to GPT to fix line breaks and hyphenation')

    # Azure baseline subcommand
    parser_azure = subparsers.add_parser(
        "azure-baseline", help="Extract content using Azure Document Intelligence for baseline comparison"
    )
    parser_azure.add_argument(
        'input_files', nargs='+', help='One or more input files (images or PDFs) to process with Azure Document Intelligence'
    )
    parser_azure.add_argument(
        '-o', '--output-dir', default='output_azure', help='Output directory for Azure baseline result files (default: output_azure)'
    )
    parser_azure.add_argument(
        '--format', choices=['json'], default='json', help='Output format: json (default: json)'
    )

    # Azure overlay HTML subcommand
    parser_azure_overlay = subparsers.add_parser("azure-overlay-html", help="Generate HTML overlay for Azure Document Intelligence output (words)")
    parser_azure_overlay.add_argument('--image', required=True, help='Path to scanned image file')
    parser_azure_overlay.add_argument('--json', required=True, help='Path to Azure Document Intelligence JSON file')
    parser_azure_overlay.add_argument('--output', required=True, help='Path to output HTML file')

    args = parser.parse_args()

    # Sub-commands are optional for the parser, so a bare invocation reaches
    # this point with no input_files attribute. Report a usage error instead of
    # crashing with AttributeError further down.
    if args.command is None:
        parser.error(
            "a command is required: process, overlay-html, stitch-text, "
            "azure-baseline or azure-overlay-html"
        )

    if args.command == "overlay-html":
        output_html = args.output or (os.path.splitext(args.image_file)[0] + "_overlay.html")
        generate_docling_overlay(args.image_file, args.json_file, output_html)
        return

    if args.command == "stitch-text":
        stitch_text_from_json(args.json_file, gpt_fix=getattr(args, 'gpt_fix', False))
        return

    if args.command == "azure-baseline":
        extract_with_azure(
            args.input_files,
            args.output_dir,
            output_format=args.format,
            verbose=True
        )
        return

    if args.command == "azure-overlay-html":
        generate_azure_overlay_html(args.image, args.json, args.output)
        return

    # Remaining command: process
    valid_files = []
    for file_path in args.input_files:
        if not os.path.exists(file_path):
            print(f"Warning: File not found: {file_path}", file=sys.stderr)
        else:
            valid_files.append(file_path)
    if not valid_files:
        print("Error: No valid input files provided", file=sys.stderr)
        sys.exit(1)
    process_files(valid_files, args.output_dir, output_format=args.format, debug=args.debug)


if __name__ == '__main__':
    main()
|
smoldocling/overlays.py
ADDED
@@ -0,0 +1,305 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import json
|
3 |
+
from PIL import Image
|
4 |
+
import os
|
5 |
+
import base64
|
6 |
+
|
7 |
+
HTML_TEMPLATE = '''<!DOCTYPE html>
|
8 |
+
<html lang="en">
|
9 |
+
<head>
|
10 |
+
<meta charset="UTF-8">
|
11 |
+
<title>Document Overlay</title>
|
12 |
+
<style>
|
13 |
+
.overlay-container {{
|
14 |
+
position: relative;
|
15 |
+
width: {img_width}px;
|
16 |
+
height: {img_height}px;
|
17 |
+
background: url('{img_src}') no-repeat;
|
18 |
+
background-size: 100% 100%;
|
19 |
+
border: 1px solid #ccc;
|
20 |
+
}}
|
21 |
+
.word-box {{
|
22 |
+
position: absolute;
|
23 |
+
border: 1px solid #e74c3c;
|
24 |
+
background: rgba(255,255,0,0.3);
|
25 |
+
font-size: 12px;
|
26 |
+
color: #222;
|
27 |
+
padding: 0;
|
28 |
+
margin: 0;
|
29 |
+
line-height: 1;
|
30 |
+
pointer-events: none;
|
31 |
+
white-space: pre;
|
32 |
+
overflow: hidden;
|
33 |
+
}}
|
34 |
+
</style>
|
35 |
+
</head>
|
36 |
+
<body>
|
37 |
+
<div class="overlay-container">
|
38 |
+
{boxes}
|
39 |
+
</div>
|
40 |
+
</body>
|
41 |
+
</html>
|
42 |
+
'''
|
43 |
+
|
44 |
+
def load_image_size(image_path):
    """Return ``(width, height)`` of the image stored at ``image_path``."""
    with Image.open(image_path) as picture:
        dimensions = (picture.width, picture.height)
    return dimensions
|
47 |
+
|
48 |
+
def extract_words(json_data):
    """Collect the words of an Azure Document Intelligence layout result.

    Words are read from ``pages[i]['words']`` of every page. A word is kept
    only when its ``polygon`` holds exactly eight numbers (four x/y pairs).

    Returns:
        A list of ``{'text': ..., 'polygon': ...}`` dicts in page order.
    """
    collected = []
    for page in json_data.get('pages', []):
        for entry in page.get('words', []):
            corners = entry.get('polygon', [])
            if len(corners) != 8:
                # Not a four-corner polygon; skip it.
                continue
            collected.append({'text': entry.get('content', ''), 'polygon': corners})
    return collected
|
58 |
+
|
59 |
+
def polygon_to_bbox(polygon):
    """Return the axis-aligned bounds ``(x_min, y_min, x_max, y_max)`` of a flat polygon.

    ``polygon`` is a flat sequence ``[x0, y0, x1, y1, ...]``: even positions
    are x coordinates, odd positions are y coordinates.
    """
    x_coords, y_coords = polygon[::2], polygon[1::2]
    left, right = min(x_coords), max(x_coords)
    top, bottom = min(y_coords), max(y_coords)
    return left, top, right, bottom
|
65 |
+
|
66 |
+
def scale_polygon(polygon, scale_x, scale_y):
    """Scale the first eight values of a flat ``[x0, y0, ..., x3, y3]`` polygon.

    Values at even positions are multiplied by ``scale_x`` and values at odd
    positions by ``scale_y``. Exactly eight values are read, so a shorter
    polygon raises ``IndexError``.
    """
    scaled = []
    for position in range(8):
        factor = scale_y if position % 2 else scale_x
        scaled.append(polygon[position] * factor)
    return scaled
|
68 |
+
|
69 |
+
def generate_azure_overlay_html(image_path, json_path, output_path):
    """Write an HTML page that overlays Azure Document Intelligence words on the scan.

    Every word becomes an absolutely positioned ``<span class="word-box">``
    placed at the axis-aligned bounding box of its polygon, after scaling the
    polygon from the document's coordinate system to image pixels.

    Args:
        image_path: Path to the scanned image. The HTML references it by a path
            relative to the output file's directory; it is not embedded.
        json_path: Path to the Azure Document Intelligence JSON result.
        output_path: Path of the HTML file to write.

    Notes:
        The scale factors come from the first page's ``width``/``height`` only,
        while words of every page are drawn, so the result is meaningful for
        single-page inputs. When the JSON has no pages or no usable page size,
        the image size is used (scale 1) instead of failing.
    """
    # Local import: the word text is OCR output and must be escaped before it
    # is placed into markup.
    from html import escape

    img_width, img_height = load_image_size(image_path)

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Page dimensions (first page); fall back to the image size when missing,
    # None or zero so the division below cannot fail.
    pages = data.get('pages') or []
    first_page = pages[0] if pages else {}
    doc_width = first_page.get('width') or img_width
    doc_height = first_page.get('height') or img_height
    scale_x = img_width / doc_width
    scale_y = img_height / doc_height

    boxes = []
    for word in extract_words(data):
        scaled_poly = scale_polygon(word['polygon'], scale_x, scale_y)
        # Use the true bounding box rather than corners 0 and 2, so rotated or
        # differently ordered polygons still get a correct origin and size.
        x_min, y_min, x_max, y_max = polygon_to_bbox(scaled_poly)
        style = (
            f"left:{x_min:.2f}px;top:{y_min:.2f}px;"
            f"width:{x_max - x_min:.2f}px;height:{y_max - y_min:.2f}px;"
        )
        boxes.append(f'<span class="word-box" style="{style}">{escape(str(word["text"]))}</span>')

    # Relative image path for the HTML. CSS treats a backslash inside
    # url('...') as an escape character, so always use forward slashes.
    img_src = os.path.relpath(image_path, os.path.dirname(output_path) or os.curdir)
    img_src = img_src.replace(os.sep, '/')

    page_html = HTML_TEMPLATE.format(
        img_width=img_width,
        img_height=img_height,
        img_src=img_src,
        boxes='\n'.join(boxes),
    )

    # The template declares charset UTF-8, so write the file as UTF-8.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(page_html)
    print(f"Overlay HTML written to {output_path}")
|
120 |
+
|
121 |
+
# Stylesheet for the docling overlay page, filled in with str.format();
# literal CSS braces are doubled so that format() leaves them alone.
_DOCLING_OVERLAY_CSS = '''.container {{
position: relative;
width: {img_width}px;
height: {img_height}px;
background: #222;
}}
.overlay-img {{
display: block;
width: {img_width}px;
height: {img_height}px;
}}
.bbox {{
position: absolute;
box-sizing: border-box;
cursor: pointer;
}}
.bbox-text {{
border: 2px solid red;
}}
.bbox-picture {{
border: 2px solid green;
}}
.bbox-group {{
border: 2px solid blue;
}}
.tooltip {{
display: none;
position: absolute;
background: #fff;
color: #222;
border: 1px solid #888;
padding: 6px 10px;
border-radius: 4px;
z-index: 10;
pointer-events: none;
max-width: 400px;
font-size: 15px;
box-shadow: 0 2px 8px rgba(0,0,0,0.2);
white-space: pre-line;
}}'''

# Tooltip behaviour: shows the hovered box's data-tooltip text near the mouse,
# clamped to the image container.
_DOCLING_OVERLAY_SCRIPT = '''<script>
const tooltip = document.getElementById('tooltip');
function showTooltip(e, idx) {
const bbox = e.target;
const text = bbox.getAttribute('data-tooltip');
tooltip.innerText = text;
tooltip.style.display = 'block';
// Position tooltip near mouse, but inside container
const container = document.getElementById('img-container');
let x = e.clientX - container.getBoundingClientRect().left + 10;
let y = e.clientY - container.getBoundingClientRect().top + 10;
// Clamp to container
x = Math.min(x, container.offsetWidth - tooltip.offsetWidth - 10);
y = Math.min(y, container.offsetHeight - tooltip.offsetHeight - 10);
tooltip.style.left = x + 'px';
tooltip.style.top = y + 'px';
}
function hideTooltip() {
tooltip.style.display = 'none';
}
</script>'''


def _prov_bboxes(item):
    """Yield every non-empty ``bbox`` found in the ``prov`` entries of a document item."""
    for prov in item.get("prov", []):
        bbox = prov.get("bbox")
        if bbox:
            yield bbox


def _bboxes_for_refs(refs, texts_by_ref, groups_by_ref):
    """Collect the bboxes of all texts referenced by ``refs``, descending into nested groups."""
    found = []
    for ref in refs:
        target = ref["$ref"]
        if target.startswith("#/texts/"):
            text = texts_by_ref.get(target)
            if text:
                found.extend(_prov_bboxes(text))
        elif target.startswith("#/groups/"):
            group = groups_by_ref.get(target)
            if group:
                found.extend(
                    _bboxes_for_refs(group.get("children", []), texts_by_ref, groups_by_ref)
                )
    return found


def generate_docling_overlay(image_path, json_path, output_path):
    """
    Generate an HTML file overlaying bounding boxes from the JSON on the image, with tooltips showing the extracted text on hover.
    Returns the HTML content as a string.

    The image is embedded as a base64 data URI. Boxes are drawn for every
    ``prov`` bbox of ``texts`` (red) and ``pictures`` (green), and one box per
    group (blue) enclosing the bboxes of the texts it references, directly or
    through nested groups.

    Args:
        image_path: Path to the source image.
        json_path: Path to the document JSON (reads ``texts``, ``pictures``, ``groups``).
        output_path: Path of the HTML file to write (UTF-8).

    Returns:
        The generated HTML as a string.
    """
    # Local imports: both are needed only by this function.
    import mimetypes
    from html import escape

    # Load image and encode as base64
    with open(image_path, "rb") as img_f:
        img_b64 = base64.b64encode(img_f.read()).decode("utf-8")
    # Context manager so the image file handle is released right away.
    with Image.open(image_path) as img:
        img_width, img_height = img.size
    # Label the data URI with the actual image type instead of always PNG.
    mime_type = mimetypes.guess_type(image_path)[0] or ""
    if not mime_type.startswith("image/"):
        mime_type = "image/png"

    # Load JSON
    with open(json_path, "r", encoding="utf-8") as f:
        doc = json.load(f)

    # NOTE(review): bbox values are used as top-left-origin pixel coordinates
    # as-is; if the producer emits a bottom-left origin the boxes would need
    # flipping. Confirm against the JSON producer.
    boxes = []
    # Texts: red
    for text in doc.get("texts", []):
        for bbox in _prov_bboxes(text):
            boxes.append({
                "l": bbox["l"], "t": bbox["t"], "r": bbox["r"], "b": bbox["b"],
                "text": text.get("text", ""),
                "type": "text",
            })
    # Pictures: green
    for pic in doc.get("pictures", []):
        for bbox in _prov_bboxes(pic):
            boxes.append({
                "l": bbox["l"], "t": bbox["t"], "r": bbox["r"], "b": bbox["b"],
                "text": pic.get("label", "picture"),
                "type": "picture",
            })
    # Groups: blue (enclosing all referenced texts)
    groups_by_ref = {g["self_ref"]: g for g in doc.get("groups", [])}
    texts_by_ref = {t["self_ref"]: t for t in doc.get("texts", [])}
    for group in doc.get("groups", []):
        bboxes = _bboxes_for_refs(group.get("children", []), texts_by_ref, groups_by_ref)
        if bboxes:
            boxes.append({
                "l": min(b["l"] for b in bboxes),
                "t": min(b["t"] for b in bboxes),
                "r": max(b["r"] for b in bboxes),
                "b": max(b["b"] for b in bboxes),
                "text": group.get("label", "group"),
                "type": "group",
            })

    # Build HTML as a list of lines
    page_title = escape(os.path.basename(image_path))
    html_lines = [
        '<!DOCTYPE html>',
        '<html lang="en">',
        '<head>',
        '<meta charset="UTF-8">',
        f'<title>Overlay for {page_title}</title>',
        '<style>',
        _DOCLING_OVERLAY_CSS.format(img_width=img_width, img_height=img_height),
        '</style>',
        '</head>',
        '<body>',
        f'<h2>Overlay for {page_title}</h2>',
        '<div class="container" id="img-container">',
        f'  <img src="data:{mime_type};base64,{img_b64}" class="overlay-img" alt="source image">',
    ]
    # Add bounding boxes
    for i, box in enumerate(boxes):
        left = box["l"]
        top = box["t"]
        width = box["r"] - box["l"]
        height = box["b"] - box["t"]
        # Extracted text is untrusted: escape &, <, > and both quote characters
        # so it cannot break out of the data-tooltip attribute.
        tooltip = escape(str(box["text"]), quote=True)
        box_class = f"bbox bbox-{box['type']}"
        html_lines.append(
            f'<div class="{box_class}" '
            f'style="left:{left}px;top:{top}px;width:{width}px;height:{height}px;" '
            f'data-tooltip="{tooltip}" onmousemove="showTooltip(event, {i})" '
            f'onmouseleave="hideTooltip()"></div>'
        )
    html_lines.append('<div class="tooltip" id="tooltip"></div>')
    html_lines.append('</div>')
    html_lines.append(_DOCLING_OVERLAY_SCRIPT)
    html_lines.append('</body></html>')
    html = '\n'.join(html_lines)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html)
    print(f"Overlay HTML written to {output_path}")
    return html
|
295 |
+
|
296 |
+
def main():
    """Command-line entry point: read --json, --image and --output, then write the Azure overlay."""
    parser = argparse.ArgumentParser(description="Generate HTML overlay for Azure Document Intelligence output.")
    # All three options are mandatory; they are registered in help-display order.
    required_options = (
        ('--json', 'Path to Azure Document Intelligence JSON file'),
        ('--image', 'Path to scanned image file'),
        ('--output', 'Path to output HTML file'),
    )
    for flag, description in required_options:
        parser.add_argument(flag, required=True, help=description)
    options = parser.parse_args()
    generate_azure_overlay_html(options.image, options.json, options.output)


if __name__ == '__main__':
    main()
|
smoldocling/pyproject.toml
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[build-system]
|
2 |
+
requires = ["setuptools>=45", "wheel"]
|
3 |
+
build-backend = "setuptools.build_meta"
|
4 |
+
|
5 |
+
[project]
|
6 |
+
name = "smoldocling"
|
7 |
+
version = "0.1.0"
|
8 |
+
description = "CLI tool for processing document images using Smoldocling"
|
9 |
+
authors = [
|
10 |
+
{name = "Your Name", email = "[email protected]"},
|
11 |
+
]
|
12 |
+
dependencies = [
|
13 |
+
"docling-core",
|
14 |
+
# "mlx-vlm",
|
15 |
+
"Pillow>=10.0.0",
|
16 |
+
"pdf2image>=1.16.3",
|
17 |
+
"mcp[cli]>=1.7.0",
|
18 |
+
"fastapi[standard]>=0.115.12",
|
19 |
+
"torch>=2.7.0",
|
20 |
+
"openai>=1.78.1",
|
21 |
+
]
|
22 |
+
requires-python = ">=3.10"
|
23 |
+
|
24 |
+
[project.scripts]
|
25 |
+
smoldocling = "smoldocling.cli:main"
|
26 |
+
|
27 |
+
[tool.setuptools]
|
28 |
+
packages = ["smoldocling"]
|
smoldocling/server.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mcp.server.fastmcp import FastMCP
|
2 |
+
from fastapi import FastAPI
|
3 |
+
from smoldocling.cli import process_files
|
4 |
+
import json
|
5 |
+
|
6 |
+
# app = FastAPI()
# MCP server instance; tools are registered on it with the @mcp.tool() decorator.
mcp = FastMCP("Smoldocling Document Extractor")

@mcp.tool()
def extract_document(file_path: str) -> dict:
    """
    Extract text and structure from a document at the given file path.
    Returns a dictionary with the extracted document information as JSON.

    The work is delegated to ``process_files`` with a single-element file list,
    no output directory and the "json" output format; its result is returned
    unchanged.
    """
    # NOTE(review): the CLI entry point calls process_files with ``debug=...``,
    # not ``verbose=...`` - confirm that process_files accepts ``verbose``.
    # NOTE(review): the ``dict`` return type relies on what process_files
    # returns for a single file - verify against its implementation.
    result = process_files([file_path], output_dir=None, output_format="json", verbose=False)
    return result

# app.mount("/mcp", mcp)
if __name__ == "__main__":
    # When executed as a script, run the MCP server with the stdio transport.
    mcp.run(transport='stdio')
|
smoldocling/test_server.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
from mcp import ClientSession, StdioServerParameters
|
3 |
+
from mcp.client.stdio import stdio_client
|
4 |
+
import os
|
5 |
+
|
6 |
+
async def main():
    """Start the MCP server over stdio, list its tools and call ``extract_document`` once."""
    # Launch parameters: run the server script with "python", passing a copy of
    # the current environment.
    launch_spec = StdioServerParameters(
        command="python",
        args=["smoldocling/server.py"],
        env=os.environ.copy(),
    )

    # Both context managers are entered in order and exited in reverse order.
    async with stdio_client(launch_spec) as (reader, writer), ClientSession(reader, writer) as client:
        # The connection must be initialized before any request is sent.
        await client.initialize()

        available = await client.list_tools()
        print("[DEBUG] Available tools:", available)

        outcome = await client.call_tool(
            "extract_document",
            arguments={"file_path": "input/p2.png"},
        )
        print("[DEBUG] extract_document result:", outcome)


if __name__ == "__main__":
    asyncio.run(main())
|
smoldocling/testrun.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
sys.path.append('../')
|
3 |
+
from smoldocling import cli
|
4 |
+
from ipywidgets import HTML
|
5 |
+
import dotenv
|
6 |
+
|
7 |
+
# Input page image and the directory that receives the generated artefacts.
output_png = '../data/legislatures/AZ_h_1913_apr_special_p9.png'
output_dir = '../output/'

# Step 1: run the extraction with JSON output. The paths built below assume it
# produces "<image stem>.json" inside output_dir.
cli.process_files([output_png], output_dir, output_format="json")

# File name without its directory part and without ".png".
fileName = output_png.rpartition("/")[2].replace(".png", '')
json_output = f"{output_dir}{fileName}.json"
overlay_html = f"{output_dir}{fileName}_overlay.html"

# Step 2: render the bounding-box overlay for the extracted JSON.
html_output = cli.generate_docling_overlay(output_png, json_output, overlay_html)

# Load variables from a .env file before the text-stitching step.
dotenv.load_dotenv()

# Step 3: stitch the extracted text together, without the GPT fix-up pass.
cleaned_text = cli.stitch_text_from_json(json_output, gpt_fix=False)
|