jobian commited on
Commit
ceaf2e8
·
1 Parent(s): c6954d9

Added the Smoldocling package and implemented its first test endpoint, /parse

Browse files
Dockerfile CHANGED
@@ -1,16 +1,16 @@
1
- # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
- # you will also find guides on how best to write your Dockerfile
3
 
4
- FROM python:3.9
5
-
6
- RUN useradd -m -u 1000 user
7
- USER user
8
- ENV PATH="/home/user/.local/bin:$PATH"
9
 
10
  WORKDIR /app
11
 
12
- COPY --chown=user ./requirements.txt requirements.txt
13
- RUN pip install --no-cache-dir --upgrade -r requirements.txt
 
 
 
 
 
 
14
 
15
- COPY --chown=user . /app
16
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
+ # Dockerfile
 
2
 
3
+ FROM python:3.10-slim
 
 
 
 
4
 
5
  WORKDIR /app
6
 
7
+ # Copy everything including smoldocling
8
+ COPY . .
9
+
10
+ # Install requirements (editable install of local package)
11
+ RUN pip install --no-cache-dir -r requirements.txt && \
12
+ pip install -e ./smoldocling
13
+
14
+ EXPOSE 7860
15
 
 
16
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,8 +1,61 @@
1
- from fastapi import FastAPI
 
 
 
 
 
 
 
2
 
 
 
 
 
3
  app = FastAPI()
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- @app.get("/")
7
- def greet_json():
8
- return {"Hello": "World!"}
 
 
 
1
+ import os
2
+ from fastapi import FastAPI, UploadFile, File, HTTPException
3
+ from fastapi.responses import JSONResponse
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+ from typing import List
6
+ from smoldocling import cli
7
+ import shutil
8
+ import dotenv
9
 
10
# Pull settings from a local .env file into the process environment.
dotenv.load_dotenv()

# ASGI application object served by uvicorn ("app:app").
app = FastAPI()

# Wide-open CORS: convenient for development and testing.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Working directories: uploads are stored in UPLOAD_DIR, pipeline results in
# OUTPUT_DIR. Both are created at import time so request handlers can rely
# on them existing.
UPLOAD_DIR = "uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

OUTPUT_DIR = "output"
os.makedirs(OUTPUT_DIR, exist_ok=True)
29
+
30
+
31
def docling_process_files(file_list: List[str]) -> str:
    """Run the Smoldocling pipeline on ``file_list`` and return the stitched text.

    All files are converted to JSON inside ``OUTPUT_DIR``; the overlay and the
    returned text are produced for the first file only.

    Args:
        file_list: Paths of the files to process. Must not be empty.

    Returns:
        The plain text stitched from the first file's JSON output.

    Raises:
        ValueError: If ``file_list`` is empty.
        RuntimeError: If no JSON output was produced for the first file.
    """
    import sys  # local import: only needed for the stderr diagnostic below

    if not file_list:
        raise ValueError("file_list must contain at least one file path.")

    cli.process_files(file_list, OUTPUT_DIR, output_format='json')

    file_path = file_list[0].replace('\\', '/')
    file_name = os.path.splitext(os.path.basename(file_path))[0]

    json_output = os.path.join(OUTPUT_DIR, f"{file_name}.json")
    overlay_html = os.path.join(OUTPUT_DIR, f"{file_name}_overlay.html")

    # cli.process_files reports per-file failures on stderr instead of
    # raising, so a missing output file is the only failure signal we get.
    if not os.path.exists(json_output):
        raise RuntimeError(f"Smoldocling produced no output for {file_path}")

    # The overlay is optional. It opens the input as an image, which
    # presumably cannot work for PDF uploads, so a failure here must not
    # prevent the text from being returned.
    try:
        cli.generate_docling_overlay(file_path, json_output, overlay_html)
    except Exception as exc:
        print(f"Overlay generation skipped for {file_path}: {exc}", file=sys.stderr)

    # Stitch final cleaned text (GPT fixing is disabled here).
    return cli.stitch_text_from_json(json_output, gpt_fix=False)
46
+
47
+
48
@app.post("/parse")
async def parse_docling(file: UploadFile = File(...)):
    """Store an uploaded document, run Smoldocling on it and return its text.

    Args:
        file: The uploaded document (multipart form field ``file``).

    Returns:
        ``{"text": ...}`` on success, or a 500 response with ``{"error": ...}``
        when storing or processing the file fails.

    Raises:
        HTTPException: 400 when no file or no usable file name was supplied.
    """
    if not file:
        raise HTTPException(status_code=400, detail="No file uploaded.")

    # file.filename is client-controlled. Keep only its final component so a
    # name such as "../../app.py" cannot escape UPLOAD_DIR. Backslashes are
    # normalised first because docling_process_files later rewrites them to
    # forward slashes.
    safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise HTTPException(status_code=400, detail="Invalid file name.")

    save_path = os.path.join(UPLOAD_DIR, safe_name)

    try:
        with open(save_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
        text_output = docling_process_files([save_path])
        return JSONResponse(content={"text": text_output})
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
requirements.txt CHANGED
@@ -1,2 +1,4 @@
1
  fastapi
2
  uvicorn[standard]
 
 
 
1
  fastapi
2
  uvicorn[standard]
3
+ python-multipart
4
+ -e ./smoldocling
smoldocling/README.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Smoldocling CLI
2
+
3
+ A command-line interface for processing document images and PDFs using the SmolDocling-256M-preview model.
4
+
5
+ ## Installation
6
+
7
+ 1. Clone this repository
8
+ 2. Install the required dependencies:
9
+ ```bash
10
+ pip install -r requirements.txt
11
+ ```
12
+
13
+ ## Usage
14
+
15
+ The CLI supports processing one or multiple document images and PDFs at once. The processed output will be saved as HTML files.
16
+
17
+ Basic usage:
18
+ ```bash
19
+ python smoldocling_cli.py input1.png input2.jpg input3.pdf
20
+ ```
21
+
22
+ Specify output directory:
23
+ ```bash
24
+ python smoldocling_cli.py -o custom_output input1.png document.pdf
25
+ ```
26
+
27
+ ### Arguments
28
+
29
+ - `input_files`: One or more input files (images or PDFs) to process
30
+ - `-o, --output-dir`: Output directory for HTML files (default: 'output')
31
+
32
+ ### Example
33
+
34
+ ```bash
35
+ python smoldocling_cli.py document1.png document2.pdf -o processed_docs
36
+ ```
37
+
38
+ This will:
39
+ 1. Process document1.png and generate document1.html
40
+ 2. Process document2.pdf and generate document2.html (containing all pages in a single file)
41
+ 3. Create a directory called 'processed_docs' if it doesn't exist
42
+ 4. Save all HTML files in the processed_docs directory
43
+
44
+ ## Notes
45
+
46
+ - The script will automatically create the output directory if it doesn't exist
47
+ - Each input image file will generate a corresponding HTML file with the same name (but .html extension)
48
+ - PDF files will generate a single HTML file containing all pages
49
+ - Currently, PDF processing is limited to the first 3 pages due to model limitations
50
+ - Failed processing of one file won't stop the processing of other files
51
+ - Error messages will be printed to stderr
52
+ - The model is loaded only once for processing multiple files
53
+
54
+
smoldocling/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """
2
+ Smoldocling CLI package for processing document images.
3
+ """
4
+
5
+ __version__ = "0.1.0"
smoldocling/cli.py ADDED
@@ -0,0 +1,530 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import argparse
4
+ import os
5
+ from pathlib import Path
6
+ from PIL import Image, ImageDraw
7
+ from docling_core.types.doc import DoclingDocument, ImageRefMode
8
+ from docling_core.types.doc.document import DocTagsDocument
9
+ import torch
10
+ from transformers import AutoProcessor, AutoModelForVision2Seq
11
+ from transformers.image_utils import load_image
12
+ import sys
13
+ from pdf2image import convert_from_path
14
+ import tempfile
15
+ import json
16
+ import matplotlib.pyplot as plt
17
+ from pprint import pprint
18
+ import base64
19
+ from dotenv import load_dotenv
20
+ import openai
21
+ from azure.ai.documentintelligence import DocumentIntelligenceClient
22
+ from azure.core.credentials import AzureKeyCredential
23
+ from smoldocling.overlays import generate_azure_overlay_html, generate_docling_overlay
24
+ from PIL import Image
25
+ import requests
26
+ from io import BytesIO
27
+
28
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
29
+
30
+ load_dotenv()
31
+
32
+
33
def load_model(verbose=True):
    """Load the Smoldocling model and return ``(model, processor)``.

    Args:
        verbose: When true, print a progress message before loading.

    Returns:
        A tuple of the vision-to-sequence model (moved to ``DEVICE``) and its
        processor.
    """
    if verbose:
        print("Loading Smoldocling model...")
    model_path = "ds4sd/SmolDocling-256M-preview"
    processor = AutoProcessor.from_pretrained(model_path)
    # Half precision is kept for GPU runs (the original targets a T4). On CPU
    # fall back to float32, since float16 kernels are not reliably available
    # there.
    dtype = torch.float16 if DEVICE == "cuda" else torch.float32
    model = AutoModelForVision2Seq.from_pretrained(
        model_path,
        torch_dtype=dtype,
    ).to(DEVICE)
    return model, processor
44
+
45
+
46
def run_model(model, processor, image, prompt="Convert this page to docling.", verbose=True):
    """Generate doctags for ``image`` with the Smoldocling model.

    A single user turn made of the image and ``prompt`` is rendered through
    the processor's chat template, passed to ``model.generate`` and the newly
    generated tokens (prompt tokens removed) are decoded with special tokens
    kept. The decoded string is returned with leading whitespace stripped.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": prompt},
        ],
    }
    chat_text = processor.apply_chat_template([user_turn], add_generation_prompt=True)
    model_inputs = processor(
        text=chat_text,
        images=[image],
        return_tensors="pt",
        truncation=True,
    ).to(DEVICE)

    if verbose:
        print("Generating text...")
    output_ids = model.generate(**model_inputs, max_new_tokens=8192)

    # Drop the echoed prompt so only the completion is decoded.
    n_prompt_tokens = model_inputs.input_ids.shape[1]
    completions = processor.batch_decode(
        output_ids[:, n_prompt_tokens:],
        skip_special_tokens=False,
    )
    return completions[0].lstrip()
73
+
74
+
75
def extract_text_from_document(image_path, model, processor, output_format="html", verbose=True):
    """Convert one document image with Smoldocling.

    Returns the exported document as a dict when ``output_format`` is
    ``"json"`` (picture image URIs removed), otherwise as an HTML string with
    embedded images. Any failure is reported on stderr (when ``verbose``) and
    ``None`` is returned.
    """
    try:
        page_image = Image.open(image_path)

        if verbose:
            print(f"Processing {image_path}")
            print(f"Image mode: {page_image.mode}")
            print(f"Image size: {page_image.size}")

        # Run the vision-language model and wrap its doctags in a document.
        doctags = run_model(model, processor, page_image, verbose=verbose)
        doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [page_image])
        doc = DoclingDocument(name=Path(image_path).stem).load_from_doctags(doctags_doc)

        if output_format != "json":
            return doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)

        exported = doc.export_to_dict()
        if "pictures" in exported:
            # Remove the image URI of every picture from the JSON payload.
            for picture in exported["pictures"]:
                if "image" in picture and "uri" in picture["image"]:
                    del picture["image"]["uri"]
        return exported

    except Exception as e:
        if verbose:
            print(f"Error processing 1: {image_path}: {str(e)}", file=sys.stderr)
        return None
113
+
114
+
115
def _strip_page_images(doc_dict):
    """Set the ``image`` entry of every page in an exported document dict to None."""
    pages = doc_dict.get("pages")
    if not pages:
        return
    # NOTE(review): docling-core appears to export "pages" as a mapping keyed
    # by page number. Iterating it directly yields the keys, so the page
    # dicts must be taken from .values(). A plain list is handled as well.
    page_items = pages.values() if isinstance(pages, dict) else pages
    for page in page_items:
        if isinstance(page, dict) and "image" in page:
            page["image"] = None


def process_pdf(pdf_path, model, processor, output_dir, output_format="html", debug=False, verbose=True):
    """Process a PDF by rendering each page to an image and running the model on it.

    Args:
        pdf_path: Path of the PDF to convert.
        model: Model object passed through to ``run_model``.
        processor: Processor object passed through to ``run_model``.
        output_dir: Directory for the combined output file. When ``None`` the
            result is returned instead of written.
        output_format: ``"json"`` for a dict/JSON export, anything else for HTML.
        debug: When true and ``output_dir`` is set, also dump one JSON file per
            page, named ``<stem>_p<page>.json``.
        verbose: When true, print progress and error messages.

    Returns:
        The exported dict (JSON) or HTML string when ``output_dir`` is ``None``
        and at least one page succeeded; otherwise ``None``. Errors are
        reported on stderr (when ``verbose``) rather than raised.
    """
    try:
        if verbose:
            print(f"\nProcessing PDF: {pdf_path}")
        # Convert PDF to images
        with tempfile.TemporaryDirectory() as temp_dir:
            if verbose:
                print("Converting PDF to images...")
            # TODO: Review this. It's not working when the PDF is large.
            images = convert_from_path(
                pdf_path,
                output_folder=temp_dir,
                first_page=1,
                fmt="png"
            )
            if not images:
                if verbose:
                    print(f"No pages found in PDF: {pdf_path}", file=sys.stderr)
                return

            all_doctags = []
            all_images = []
            for i, image in enumerate(images, start=1):
                image_path = os.path.join(temp_dir, f"page_{i}.png")
                image.save(image_path, "PNG")
                if verbose:
                    print(f"\nProcessing page {i}")
                try:
                    image = Image.open(image_path)
                    if verbose:
                        print(f"Processing {image_path}")
                        print(f"Image mode: {image.mode}")
                        print(f"Image size: {image.size}")
                    output = run_model(model, processor, image, verbose=verbose)
                    cleaned_output = output.replace("<end_of_utterance>", "").strip()
                    # Chart blocks are re-tagged as OTSL tables before parsing.
                    if "<chart>" in cleaned_output:
                        cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
                    all_doctags.append(cleaned_output)
                    all_images.append(image)
                    if verbose:
                        print(f"Successfully processed page {i}")
                    # DEBUG: Dump per-page JSON if requested
                    if debug and output_dir is not None:
                        doctags_doc_page = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], [image])
                        doc_page = DoclingDocument(name=f"{Path(pdf_path).stem}_p{i}")
                        doc_page.load_from_doctags(doctags_doc_page)
                        doc_dict_page = doc_page.export_to_dict()
                        _strip_page_images(doc_dict_page)
                        page_json_path = Path(output_dir) / f"{Path(pdf_path).stem}_p{i}.json"
                        with open(page_json_path, 'w', encoding='utf-8') as f:
                            json.dump(doc_dict_page, f, ensure_ascii=False, indent=2)
                        if verbose:
                            print(f"[DEBUG] Dumped page {i} JSON to {page_json_path}")
                except Exception as e:
                    # A failing page is skipped; the remaining pages still run.
                    if verbose:
                        print(f"Error processing page {i}: {str(e)}", file=sys.stderr)

            if not (all_doctags and all_images):
                if verbose:
                    print("No pages were successfully processed", file=sys.stderr)
                return

            # Combine every successfully processed page into one document.
            doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
                all_doctags,
                all_images
            )
            doc = DoclingDocument(name=Path(pdf_path).stem)
            doc.load_from_doctags(doctags_doc)

            if output_format == "json":
                doc_dict = doc.export_to_dict()
                _strip_page_images(doc_dict)
                if output_dir is None:
                    return doc_dict
                output_path = Path(output_dir) / f"{Path(pdf_path).stem}.json"
                with open(output_path, 'w', encoding='utf-8') as f:
                    json.dump(doc_dict, f, ensure_ascii=False, indent=2)
            else:
                html_output = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
                if output_dir is None:
                    return html_output
                output_path = Path(output_dir) / f"{Path(pdf_path).stem}.html"
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(html_output)
            if verbose:
                print(f"\nSuccessfully saved combined output to {output_path}")
    except Exception as e:
        if verbose:
            print(f"Error processing PDF {pdf_path}: {str(e)}", file=sys.stderr)
214
+
215
+
216
def process_files(input_files, output_dir, output_format="html", debug=False, verbose=True):
    """Process multiple input files and generate outputs in the specified format.

    The model is loaded once and reused for every file. A failure on one file
    is reported on stderr (when ``verbose``) and does not stop the others.

    Args:
        input_files: Paths of images or PDFs to process.
        output_dir: Directory for result files (created if missing). When
            ``None`` nothing is written and the results are returned instead.
        output_format: ``"html"`` or ``"json"``.
        debug: Forwarded to ``process_pdf``.
        verbose: When true, print progress and error messages.

    Returns:
        When ``output_dir`` is ``None``: a list with one entry per successfully
        processed file (a dict for JSON, a string for HTML). Otherwise ``None``.
    """
    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)

    model, processor = load_model(verbose=verbose)
    results = []

    for input_file in input_files:
        try:
            input_path = Path(input_file)
            if input_path.suffix.lower() == '.pdf':
                pdf_result = process_pdf(
                    input_file, model, processor, output_dir,
                    output_format=output_format, debug=debug, verbose=verbose,
                )
                # process_pdf returns one dict or one HTML string per PDF.
                # append() keeps it intact; extend() would splat a dict's keys
                # or a string's characters into the list.
                if output_dir is None and pdf_result:
                    results.append(pdf_result)
                continue

            if verbose:
                print(f"\nProcessing: {input_file}")

            doc_dict = extract_text_from_document(
                input_path, model, processor, output_format=output_format, verbose=verbose
            )
            if not doc_dict:
                if verbose:
                    print(f"Failed to process {input_file}", file=sys.stderr)
                continue

            if output_dir is None:
                results.append(doc_dict)
            else:
                output_path = Path(output_dir) / f"{input_path.stem}.{output_format}"
                if verbose:
                    print(f"Output will be saved to: {output_path}")
                with open(output_path, 'w', encoding='utf-8') as f:
                    if output_format == "json":
                        json.dump(doc_dict, f, ensure_ascii=False, indent=2)
                    elif output_format == "html":
                        f.write(doc_dict)
            if verbose:
                print(f"Successfully processed {input_file}")

        except Exception as e:
            if verbose:
                print(f"Error processing 2 {input_file}: {str(e)}", file=sys.stderr)

    if output_dir is None:
        return results
264
+
265
+
266
def visualize_doc(doc_path, page_num=0):
    """
    Visualize a document (PDF or image) with bounding boxes from its corresponding JSON annotation.

    The annotation path is derived from ``doc_path`` by replacing ``"input"``
    with ``"output"`` and swapping the file extension for ``.json`` (images)
    or ``_p<page_num + 1>.json`` (PDFs).

    Args:
        doc_path (str): Path to the input document file (PDF or image)
        page_num (int): Page number to visualize for PDFs (default 0)
    """
    is_pdf = doc_path.lower().endswith('.pdf')

    # Load document
    if is_pdf:
        images = convert_from_path(doc_path, first_page=1)
        image = images[page_num]
    else:
        image = Image.open(doc_path).convert("RGB")

    # Load corresponding JSON. Swapping the extension with splitext (instead
    # of replacing the literal ".png"/".pdf") also covers .jpg inputs and
    # upper-case extensions, which previously resolved to the input file.
    base_path, _ = os.path.splitext(doc_path.replace("input", "output"))
    json_path = f"{base_path}_p{page_num + 1}.json" if is_pdf else f"{base_path}.json"
    # The pipeline writes its JSON as UTF-8, so read it the same way.
    with open(json_path, "r", encoding="utf-8") as f:
        doc = json.load(f)

    # (collection key, fallback label, restrict PDF boxes to page 1?)
    # Texts and pictures are filtered to page 1 for PDFs (currently only works
    # for first page); tables were never filtered and still are not.
    collections = (
        ("texts", "", True),
        ("pictures", "picture", True),
        ("tables", "", False),
    )

    # Collect all bounding boxes together with their labels
    bboxes = []
    labels = []
    for key, default_label, first_page_only in collections:
        for item in doc.get(key, []):
            for prov in item.get("prov", []):
                if first_page_only and is_pdf and prov.get("page_no") != 1:
                    continue
                bbox = prov.get("bbox")
                if bbox:
                    bboxes.append([bbox["l"], bbox["t"], bbox["r"], bbox["b"]])
                    labels.append(item.get("label", default_label))

    # Draw bounding boxes
    draw = ImageDraw.Draw(image)
    for (l, t, r, b), label in zip(bboxes, labels):
        draw.rectangle([l, t, r, b], outline="red", width=2)
        if label:
            draw.text((l, t-10), f"{label} ({l:.1f}, {t:.1f}, {r:.1f}, {b:.1f})", fill="red")

    # Display
    plt.figure(figsize=(10, 12))
    plt.imshow(image)
    plt.axis("off")
    plt.show()
342
+
343
+
344
def stitch_text_from_json(json_path, gpt_fix=False):
    """
    Stitch the text fragments of a DoclingDocument-format JSON file into plain text.

    Fragments are collected in the order given by ``body.children``, descending
    into referenced groups; references other than texts and groups are
    ignored. The result is printed and returned.

    Args:
        json_path: Path of the JSON file to read.
        gpt_fix: When true, send the stitched text to GPT to fix line breaks
            and hyphenation. If ``OPENAI_API_KEY`` is unset or the request
            fails, the original stitched text is used and a note is written to
            stderr.

    Returns:
        The stitched (and optionally cleaned) text as a string.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        doc = json.load(f)

    # Build lookup tables
    texts_by_ref = {f"#/texts/{i}": t for i, t in enumerate(doc.get('texts', []))}
    groups_by_ref = {g['self_ref']: g for g in doc.get('groups', [])}

    def extract_texts(children, active_groups=frozenset()):
        fragments = []
        for child in children:
            ref = child.get('$ref')
            if ref is None:
                continue
            if ref.startswith('#/texts/'):
                text = (texts_by_ref.get(ref) or {}).get('text', '')
                if text:
                    fragments.append(text)
            elif ref.startswith('#/groups/'):
                group_obj = groups_by_ref.get(ref)
                # Skip a group that is already being expanded so a
                # self-referencing document cannot recurse forever.
                if group_obj and ref not in active_groups:
                    fragments.extend(
                        extract_texts(group_obj.get('children', []), active_groups | {ref})
                    )
        return fragments

    body = doc.get('body', {})
    final_text = '\n'.join(extract_texts(body.get('children', [])))

    if gpt_fix:
        try:
            api_key = os.environ.get('OPENAI_API_KEY')
            if not api_key:
                print("OPENAI_API_KEY not set. Printing original stitched text.", file=sys.stderr)
            else:
                client = openai.OpenAI(api_key=api_key)
                prompt = (
                    "You are a helpful assistant. "
                    "The following text was extracted from a document and may contain odd line breaks, hyphenated words split across lines, or other OCR artifacts. "
                    "Please rewrite the text as clean, readable prose, fixing line breaks, joining hyphenated words, and correcting obvious errors, but do not add or remove content.\n\n"
                    f"Text to fix:\n\n{final_text}\n\nCleaned text:"
                )
                response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=4096,
                    temperature=0.0,
                )
                final_text = response.choices[0].message.content.strip()
        except Exception as e:
            print(f"[GPT-fix error] {e}. Printing original stitched text.", file=sys.stderr)

    print(final_text)
    return final_text
413
+
414
+
415
def extract_with_azure(input_files, output_dir, output_format="json", verbose=True):
    """Analyze files with Azure Document Intelligence and save each result as JSON.

    The endpoint and key are read from AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
    and AZURE_DOCUMENT_INTELLIGENCE_KEY. When either is missing a message is
    written to stderr and nothing else happens. Each input is sent base64
    encoded to the "prebuilt-layout" model and the result is written to
    ``<output_dir>/<stem>.json``. ``output_format`` is accepted but not used.
    """
    endpoint = os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
    key = os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_KEY")
    if not (endpoint and key):
        print("Azure endpoint/key not set. Set AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and AZURE_DOCUMENT_INTELLIGENCE_KEY in your environment.", file=sys.stderr)
        return

    client = DocumentIntelligenceClient(endpoint, AzureKeyCredential(key))
    os.makedirs(output_dir, exist_ok=True)

    for input_file in input_files:
        with open(input_file, "rb") as source:
            encoded = base64.b64encode(source.read()).decode("utf-8")

        analysis = client.begin_analyze_document(
            model_id="prebuilt-layout",
            body={"base64Source": encoded}
        ).result()

        output_path = Path(output_dir) / (Path(input_file).stem + ".json")
        with open(output_path, "w", encoding="utf-8") as out_f:
            json.dump(analysis.as_dict(), out_f, ensure_ascii=False, indent=2)
        if verbose:
            print(f"Azure baseline output saved to {output_path}")
438
+
439
+
440
def main():
    """Command-line entry point.

    Subcommands: ``process`` (default), ``overlay-html``, ``stitch-text``,
    ``azure-baseline`` and ``azure-overlay-html``. When the first argument is
    not a known subcommand, ``process`` is assumed, so
    ``prog file.png -o out`` behaves like ``prog process file.png -o out``.
    With no arguments at all, usage help is printed and the exit status is 2.
    """
    parser = argparse.ArgumentParser(
        description="Process document images and PDFs using Smoldocling and generate HTML or JSON outputs"
    )
    subparsers = parser.add_subparsers(dest="command", required=False)

    # Default parser for main processing
    parser_main = subparsers.add_parser("process", help="Process images or PDFs to HTML/JSON (default)")
    parser_main.add_argument(
        'input_files', nargs='+', help='One or more input files (images or PDFs) to process'
    )
    parser_main.add_argument(
        '-o', '--output-dir', default='output', help='Output directory for result files (default: output)'
    )
    parser_main.add_argument(
        '--format', choices=['html', 'json'], default='html', help='Output format: html or json (default: html)'
    )
    parser_main.add_argument(
        '--debug', action='store_true', help='Enable debug mode: dump each PDF page as a separate JSON file.'
    )

    # Overlay HTML subcommand
    parser_overlay = subparsers.add_parser("overlay-html", help="Generate HTML overlay from PNG and JSON")
    parser_overlay.add_argument('image_file', help='Source PNG image file')
    parser_overlay.add_argument('json_file', help='Extracted JSON file with bounding boxes')
    parser_overlay.add_argument('-o', '--output', help='Output HTML file (default: <image_file>_overlay.html)')

    # Stitch text subcommand
    parser_stitch = subparsers.add_parser("stitch-text", help="Stitch together text fragments from a JSON file and print as plain text")
    parser_stitch.add_argument('json_file', help='Extracted JSON file to stitch')
    parser_stitch.add_argument('--gpt-fix', action='store_true', help='Send stitched text to GPT to fix line breaks and hyphenation')

    # Azure baseline subcommand
    parser_azure = subparsers.add_parser(
        "azure-baseline", help="Extract content using Azure Document Intelligence for baseline comparison"
    )
    parser_azure.add_argument(
        'input_files', nargs='+', help='One or more input files (images or PDFs) to process with Azure Document Intelligence'
    )
    parser_azure.add_argument(
        '-o', '--output-dir', default='output_azure', help='Output directory for Azure baseline result files (default: output_azure)'
    )
    parser_azure.add_argument(
        '--format', choices=['json'], default='json', help='Output format: json (default: json)'
    )

    # Azure overlay HTML subcommand
    parser_azure_overlay = subparsers.add_parser("azure-overlay-html", help="Generate HTML overlay for Azure Document Intelligence output (words)")
    parser_azure_overlay.add_argument('--image', required=True, help='Path to scanned image file')
    parser_azure_overlay.add_argument('--json', required=True, help='Path to Azure Document Intelligence JSON file')
    parser_azure_overlay.add_argument('--output', required=True, help='Path to output HTML file')

    # argparse has no notion of a default subcommand. Insert "process" when
    # the first argument is neither a known subcommand nor a help flag, so
    # the advertised default actually works.
    known_commands = ("process", "overlay-html", "stitch-text", "azure-baseline", "azure-overlay-html")
    argv = sys.argv[1:]
    if argv and argv[0] not in known_commands and argv[0] not in ("-h", "--help"):
        argv = ["process", *argv]
    args = parser.parse_args(argv)

    if args.command is None:
        # Without a subcommand there is no args.input_files; show usage
        # instead of failing with an AttributeError further down.
        parser.print_help(sys.stderr)
        sys.exit(2)

    if args.command == "overlay-html":
        output_html = args.output or (os.path.splitext(args.image_file)[0] + "_overlay.html")
        generate_docling_overlay(args.image_file, args.json_file, output_html)
        return

    if args.command == "stitch-text":
        stitch_text_from_json(args.json_file, gpt_fix=getattr(args, 'gpt_fix', False))
        return

    if args.command == "azure-baseline":
        extract_with_azure(
            args.input_files,
            args.output_dir,
            output_format=args.format,
            verbose=True
        )
        return

    if args.command == "azure-overlay-html":
        generate_azure_overlay_html(args.image, args.json, args.output)
        return

    # Remaining command: process. Skip inputs that do not exist.
    valid_files = []
    for file_path in args.input_files:
        if not os.path.exists(file_path):
            print(f"Warning: File not found: {file_path}", file=sys.stderr)
        else:
            valid_files.append(file_path)
    if not valid_files:
        print("Error: No valid input files provided", file=sys.stderr)
        sys.exit(1)
    process_files(valid_files, args.output_dir, output_format=args.format, debug=args.debug)
527
+
528
+
529
+ if __name__ == '__main__':
530
+ main()
smoldocling/overlays.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ from PIL import Image
4
+ import os
5
+ import base64
6
+
7
+ HTML_TEMPLATE = '''<!DOCTYPE html>
8
+ <html lang="en">
9
+ <head>
10
+ <meta charset="UTF-8">
11
+ <title>Document Overlay</title>
12
+ <style>
13
+ .overlay-container {{
14
+ position: relative;
15
+ width: {img_width}px;
16
+ height: {img_height}px;
17
+ background: url('{img_src}') no-repeat;
18
+ background-size: 100% 100%;
19
+ border: 1px solid #ccc;
20
+ }}
21
+ .word-box {{
22
+ position: absolute;
23
+ border: 1px solid #e74c3c;
24
+ background: rgba(255,255,0,0.3);
25
+ font-size: 12px;
26
+ color: #222;
27
+ padding: 0;
28
+ margin: 0;
29
+ line-height: 1;
30
+ pointer-events: none;
31
+ white-space: pre;
32
+ overflow: hidden;
33
+ }}
34
+ </style>
35
+ </head>
36
+ <body>
37
+ <div class="overlay-container">
38
+ {boxes}
39
+ </div>
40
+ </body>
41
+ </html>
42
+ '''
43
+
44
def load_image_size(image_path):
    """Return ``(width, height)`` of the image stored at ``image_path``."""
    with Image.open(image_path) as picture:
        width = picture.width
        height = picture.height
    return width, height
47
+
48
def extract_words(json_data):
    """Collect the words of every page as ``{'text', 'polygon'}`` dicts.

    Words are read from ``pages[*]['words']`` (Azure Document Intelligence v4
    layout). Only words whose polygon has exactly 8 values, i.e. 4 points
    (x0, y0, ..., x3, y3), are kept.
    """
    collected = []
    for page in json_data.get('pages', []):
        for entry in page.get('words', []):
            content = entry.get('content', '')
            outline = entry.get('polygon', [])
            if len(outline) != 8:
                continue
            collected.append({'text': content, 'polygon': outline})
    return collected
58
+
59
def polygon_to_bbox(polygon):
    """Return ``(x_min, y_min, x_max, y_max)`` for a flat ``[x0, y0, x1, y1, ...]`` polygon."""
    x_coords = polygon[0::2]
    y_coords = polygon[1::2]
    left = min(x_coords)
    right = max(x_coords)
    top = min(y_coords)
    bottom = max(y_coords)
    return left, top, right, bottom
65
+
66
def scale_polygon(polygon, scale_x, scale_y):
    """Scale the first 8 values of a flat polygon: even indices by ``scale_x``, odd by ``scale_y``."""
    factors = (scale_x, scale_y)
    scaled = []
    for index in range(8):
        scaled.append(polygon[index] * factors[index % 2])
    return scaled
68
+
69
def generate_azure_overlay_html(image_path, json_path, output_path):
    """Write an HTML page that overlays Azure Document Intelligence words on the image.

    Word polygons are scaled from the page's coordinate space (the first
    page's ``width``/``height``) to image pixels and rendered as absolutely
    positioned boxes over the image.

    Args:
        image_path: Path of the scanned image.
        json_path: Path of the Azure Document Intelligence JSON result.
        output_path: Path of the HTML file to write.
    """
    from html import escape  # local import: only this function emits HTML text

    # Load image size
    img_width, img_height = load_image_size(image_path)

    # Load JSON
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Get page dimensions from JSON (assume first page). Fall back to the
    # image size when a dimension is missing or zero so the scale factors
    # below cannot divide by zero.
    page = data['pages'][0]
    doc_width = page.get('width') or img_width
    doc_height = page.get('height') or img_height

    # Compute scaling factors
    scale_x = img_width / doc_width
    scale_y = img_height / doc_height

    # Generate HTML boxes
    boxes = []
    for word in extract_words(data):
        scaled_poly = scale_polygon(word['polygon'], scale_x, scale_y)
        # Use the enclosing box of all four corners so rotated or reversed
        # polygons still get a correct top-left corner and positive size.
        left, top, right, bottom = polygon_to_bbox(scaled_poly)
        width = right - left
        height = bottom - top
        style = f"left:{left:.2f}px;top:{top:.2f}px;width:{width:.2f}px;height:{height:.2f}px;"
        # Recognised text is untrusted document content: escape it so it
        # cannot inject markup into the page.
        box_html = f'<span class="word-box" style="{style}">{escape(word["text"])}</span>'
        boxes.append(box_html)

    # Use relative path for image in HTML
    img_src = os.path.relpath(image_path, os.path.dirname(output_path))

    page_html = HTML_TEMPLATE.format(
        img_width=img_width,
        img_height=img_height,
        img_src=img_src,
        boxes='\n'.join(boxes)
    )

    # The template declares charset UTF-8, so write the file as UTF-8.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(page_html)
    print(f"Overlay HTML written to {output_path}")
120
+
121
def generate_docling_overlay(image_path, json_path, output_path):
    """
    Generate an HTML file overlaying bounding boxes from the JSON on the image, with tooltips showing the extracted text on hover.

    Boxes are drawn for every ``prov[].bbox`` of the document's ``texts``
    (red) and ``pictures`` (green), plus one enclosing box per entry of
    ``groups`` (blue) computed from the text boxes reachable through its
    ``children`` references. The image is embedded as a base64 data URI.

    Args:
        image_path: Path to the page image.
        json_path: Path to the document JSON file.
        output_path: Path of the HTML file to write.

    Returns:
        The HTML content as a string (also written to ``output_path``).

    NOTE(review): bbox values are used directly as CSS pixel offsets from
    the top-left corner; confirm the JSON uses a top-left pixel origin.
    """
    import mimetypes
    # Imported locally because the name ``html`` is reused as a variable below.
    from html import escape
    from PIL import Image as PILImage

    # Load image and encode as base64
    with open(image_path, "rb") as img_f:
        img_b64 = base64.b64encode(img_f.read()).decode("utf-8")
    # Label the data URI with the real image type; fall back to PNG.
    mime_type = mimetypes.guess_type(image_path)[0] or "image/png"
    # Context manager releases the file handle held by PIL.
    with PILImage.open(image_path) as img:
        img_width, img_height = img.size

    # Load JSON
    with open(json_path, "r", encoding="utf-8") as f:
        doc = json.load(f)

    texts = doc.get("texts", [])
    groups = doc.get("groups", [])
    texts_by_ref = {t["self_ref"]: t for t in texts}
    groups_by_ref = {g["self_ref"]: g for g in groups}

    def prov_bboxes(item):
        # All non-empty bboxes listed in an item's provenance entries.
        return [prov["bbox"] for prov in item.get("prov", []) if prov.get("bbox")]

    def make_box(bbox, label, kind):
        # Normalised record consumed by the HTML rendering loop below.
        return {
            "l": bbox["l"],
            "t": bbox["t"],
            "r": bbox["r"],
            "b": bbox["b"],
            "text": label,
            "type": kind,
        }

    def get_bbox_for_refs(refs, ancestors):
        # Get all bboxes for the referenced texts (recursively for groups).
        # ``ancestors`` holds the group refs on the current path so that a
        # cyclic group reference cannot recurse forever.
        found = []
        for ref in refs:
            target = ref["$ref"]
            if target.startswith("#/texts/"):
                text_item = texts_by_ref.get(target)
                if text_item:
                    found.extend(prov_bboxes(text_item))
            elif target.startswith("#/groups/"):
                if target in ancestors:
                    continue
                child_group = groups_by_ref.get(target)
                if child_group:
                    found.extend(
                        get_bbox_for_refs(child_group.get("children", []), ancestors | {target})
                    )
        return found

    # Collect bounding boxes and texts
    boxes = []
    # Texts: red
    for text_item in texts:
        label = text_item.get("text", "")
        boxes.extend(make_box(bbox, label, "text") for bbox in prov_bboxes(text_item))
    # Pictures: green
    for pic in doc.get("pictures", []):
        label = pic.get("label", "picture")
        boxes.extend(make_box(bbox, label, "picture") for bbox in prov_bboxes(pic))
    # Groups: blue (enclosing all children)
    for group in groups:
        bboxes = get_bbox_for_refs(group.get("children", []), {group["self_ref"]})
        if bboxes:
            boxes.append(make_box(
                {
                    "l": min(b["l"] for b in bboxes),
                    "t": min(b["t"] for b in bboxes),
                    "r": max(b["r"] for b in bboxes),
                    "b": max(b["b"] for b in bboxes),
                },
                group.get("label", "group"),
                "group",
            ))

    # The file name ends up in markup, so escape it.
    page_title = escape(os.path.basename(image_path))

    # Build HTML as a list of lines
    html_lines = [
        '<!DOCTYPE html>',
        '<html lang="en">',
        '<head>',
        '<meta charset="UTF-8">',
        f'<title>Overlay for {page_title}</title>',
        '<style>',
        f'''.container {{
    position: relative;
    width: {img_width}px;
    height: {img_height}px;
    background: #222;
}}
.overlay-img {{
    display: block;
    width: {img_width}px;
    height: {img_height}px;
}}
.bbox {{
    position: absolute;
    box-sizing: border-box;
    cursor: pointer;
}}
.bbox-text {{
    border: 2px solid red;
}}
.bbox-picture {{
    border: 2px solid green;
}}
.bbox-group {{
    border: 2px solid blue;
}}
.tooltip {{
    display: none;
    position: absolute;
    background: #fff;
    color: #222;
    border: 1px solid #888;
    padding: 6px 10px;
    border-radius: 4px;
    z-index: 10;
    pointer-events: none;
    max-width: 400px;
    font-size: 15px;
    box-shadow: 0 2px 8px rgba(0,0,0,0.2);
    white-space: pre-line;
}}''',
        '</style>',
        '</head>',
        '<body>',
        f'<h2>Overlay for {page_title}</h2>',
        '<div class="container" id="img-container">',
        f' <img src="data:{mime_type};base64,{img_b64}" class="overlay-img" alt="source image">',
    ]
    # Add bounding boxes
    for i, box in enumerate(boxes):
        left = box["l"]
        top = box["t"]
        width = box["r"] - box["l"]
        height = box["b"] - box["t"]
        # Full attribute escaping (&, <, >, quotes); a null text/label in
        # the JSON is rendered as an empty tooltip instead of crashing.
        raw_text = box["text"]
        text = escape("" if raw_text is None else str(raw_text), quote=True)
        box_class = f"bbox bbox-{box['type']}"
        html_lines.append(
            f'<div class="{box_class}" style="left:{left}px;top:{top}px;width:{width}px;height:{height}px;" '
            f'data-tooltip="{text}" onmousemove="showTooltip(event, {i})" onmouseleave="hideTooltip()"></div>'
        )
    html_lines.append('<div class="tooltip" id="tooltip"></div>')
    html_lines.append('</div>')
    html_lines.append('''<script>
const tooltip = document.getElementById('tooltip');
function showTooltip(e, idx) {
    const bbox = e.target;
    const text = bbox.getAttribute('data-tooltip');
    tooltip.innerText = text;
    tooltip.style.display = 'block';
    // Position tooltip near mouse, but inside container
    const container = document.getElementById('img-container');
    let x = e.clientX - container.getBoundingClientRect().left + 10;
    let y = e.clientY - container.getBoundingClientRect().top + 10;
    // Clamp to container
    x = Math.min(x, container.offsetWidth - tooltip.offsetWidth - 10);
    y = Math.min(y, container.offsetHeight - tooltip.offsetHeight - 10);
    tooltip.style.left = x + 'px';
    tooltip.style.top = y + 'px';
}
function hideTooltip() {
    tooltip.style.display = 'none';
}
</script>''')
    html_lines.append('</body></html>')
    html = '\n'.join(html_lines)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html)
    print(f"Overlay HTML written to {output_path}")
    return html
295
+
296
def main():
    """Command-line entry point: parse the three required paths and build the overlay."""
    cli_parser = argparse.ArgumentParser(
        description="Generate HTML overlay for Azure Document Intelligence output."
    )
    # All three options are mandatory; register them from one table.
    required_options = (
        ('--json', 'Path to Azure Document Intelligence JSON file'),
        ('--image', 'Path to scanned image file'),
        ('--output', 'Path to output HTML file'),
    )
    for flag, description in required_options:
        cli_parser.add_argument(flag, required=True, help=description)

    opts = cli_parser.parse_args()
    generate_azure_overlay_html(opts.image, opts.json, opts.output)


if __name__ == '__main__':
    main()
smoldocling/pyproject.toml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=45", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "smoldocling"
7
+ version = "0.1.0"
8
+ description = "CLI tool for processing document images using Smoldocling"
9
+ authors = [
10
+ {name = "Your Name", email = "your.email@example.com"},
11
+ ]
12
+ dependencies = [
13
+ "docling-core",
14
+ # "mlx-vlm",
15
+ "Pillow>=10.0.0",
16
+ "pdf2image>=1.16.3",
17
+ "mcp[cli]>=1.7.0",
18
+ "fastapi[standard]>=0.115.12",
19
+ "torch>=2.7.0",
20
+ "openai>=1.78.1",
21
+ ]
22
+ requires-python = ">=3.10"
23
+
24
+ [project.scripts]
25
+ smoldocling = "smoldocling.cli:main"
26
+
27
+ [tool.setuptools]
28
+ packages = ["smoldocling"]
smoldocling/server.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mcp.server.fastmcp import FastMCP
2
+ from fastapi import FastAPI
3
+ from smoldocling.cli import process_files
4
+ import json
5
+
6
# app = FastAPI()
# MCP server instance; the function registered below is exposed as a tool.
mcp = FastMCP("Smoldocling Document Extractor")


@mcp.tool()
def extract_document(file_path: str) -> dict:
    """
    Extract text and structure from a document at the given file path.
    Returns a dictionary with the extracted document information as JSON.
    """
    # Single-file call into the CLI pipeline: no output directory, JSON
    # format, non-verbose. Whatever process_files returns is passed through.
    return process_files(
        [file_path],
        output_dir=None,
        output_format="json",
        verbose=False,
    )


# app.mount("/mcp", mcp)
if __name__ == "__main__":
    # Run the MCP server over stdin/stdout when executed as a script.
    mcp.run(transport='stdio')
smoldocling/test_server.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from mcp import ClientSession, StdioServerParameters
3
+ from mcp.client.stdio import stdio_client
4
+ import os
5
+
6
async def main():
    """Spawn the MCP server over stdio, list its tools and call ``extract_document``.

    The server script path and the sample input path are relative, so this
    is expected to be run from the directory that makes
    ``smoldocling/server.py`` and ``input/p2.png`` resolvable.
    """
    # Local import: launch the server with the interpreter running this
    # script rather than whatever "python" resolves to on PATH (which may be
    # a different environment, or absent on python3-only systems).
    import sys

    server_params = StdioServerParameters(
        command=sys.executable,
        args=["smoldocling/server.py"],
        env=os.environ.copy(),
    )

    async with stdio_client(server_params) as (read, write):
        async with ClientSession(read, write) as session:
            # Initialize the connection
            await session.initialize()

            # List available tools
            tools = await session.list_tools()
            print("[DEBUG] Available tools:", tools)

            # Call the extract_document tool
            result = await session.call_tool(
                "extract_document",
                arguments={"file_path": "input/p2.png"},
            )
            print("[DEBUG] extract_document result:", result)


if __name__ == "__main__":
    asyncio.run(main())
smoldocling/testrun.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ sys.path.append('../')
3
+ from smoldocling import cli
4
+ from ipywidgets import HTML
5
+ import dotenv
6
+
7
# Sample page image and the directory that receives the generated artifacts.
output_png = '../data/legislatures/AZ_h_1913_apr_special_p9.png'
output_dir = '../output/'

# Step 1: run the document pipeline on the page, requesting JSON output.
cli.process_files([output_png], output_dir, output_format="json")

# Derive the artifact paths from the image's base name (text after the last
# "/" with ".png" removed).
# NOTE(review): presumably process_files writes "<name>.json" into
# output_dir — confirm against cli.process_files.
fileName = output_png.rsplit("/", 1)[-1].replace(".png", "")
json_output = f"{output_dir}{fileName}.json"
overlay_html = f"{output_dir}{fileName}_overlay.html"

# Step 2: render the bounding-box overlay page for the JSON result.
html_output = cli.generate_docling_overlay(output_png, json_output, overlay_html)

# Load environment variables from a .env file before the text-stitching step.
dotenv.load_dotenv()

# Step 3: stitch the extracted text back together (GPT post-fix disabled).
cleaned_text = cli.stitch_text_from_json(json_output, gpt_fix=False)