Spaces:

jobian
/

smoldocling-api

Sleeping

App Files Files Community

jobian committed on Jun 23

Commit

ceaf2e8

1 Parent(s): c6954d9

Added Smoldocling Package and implemented its first test /parse

Browse files

Files changed (11) hide show

Dockerfile +10 -10
app.py +57 -4
requirements.txt +2 -0
smoldocling/README.md +54 -0
smoldocling/__init__.py +5 -0
smoldocling/cli.py +530 -0
smoldocling/overlays.py +305 -0
smoldocling/pyproject.toml +28 -0
smoldocling/server.py +20 -0
smoldocling/test_server.py +31 -0
smoldocling/testrun.py +20 -0

Dockerfile CHANGED Viewed

@@ -1,16 +1,16 @@
-# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
-# you will also find guides on how best to write your Dockerfile
-FROM python:3.9
-RUN useradd -m -u 1000 user
-USER user
-ENV PATH="/home/user/.local/bin:$PATH"
 WORKDIR /app
-COPY --chown=user ./requirements.txt requirements.txt
-RUN pip install --no-cache-dir --upgrade -r requirements.txt
-COPY --chown=user . /app
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

+# Dockerfile
+FROM python:3.10-slim
+# pdf2image (used by smoldocling for PDF input) shells out to poppler's
+# pdftoppm, which the slim base image does not ship.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends poppler-utils && \
+    rm -rf /var/lib/apt/lists/*
+# Hugging Face Spaces runs the container as UID 1000, and app.py creates
+# uploads/ and output/ under /app at start-up, so /app must belong to that user.
+RUN useradd -m -u 1000 user && mkdir -p /app && chown user /app
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
 WORKDIR /app
+# Copy everything including smoldocling
+COPY --chown=user . .
+# Install requirements (editable install of local package)
+RUN pip install --no-cache-dir -r requirements.txt && \
+    pip install -e ./smoldocling
+EXPOSE 7860
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py CHANGED Viewed

@@ -1,8 +1,61 @@
-from fastapi import FastAPI
 app = FastAPI()
-@app.get("/")
-def greet_json():
-    return {"Hello": "World!"}

+import os
+from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi.responses import JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+from typing import List
+from smoldocling import cli
+import shutil
+import dotenv
+# Load environment variables from a .env file, if one is present
+dotenv.load_dotenv()
+# Initialize FastAPI app
 app = FastAPI()
+# Enable CORS (optional, but good for dev/testing).
+# NOTE: "*" accepts requests from any origin with any method and header.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Working directories, relative to the process's current directory:
+# uploads are saved to UPLOAD_DIR, generated JSON/overlay files to OUTPUT_DIR.
+# Both are created at import time so request handlers can assume they exist.
+UPLOAD_DIR = "uploads"
+OUTPUT_DIR = "output"
+os.makedirs(UPLOAD_DIR, exist_ok=True)
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+def docling_process_files(file_list: List[str]) -> str:
+    """Run Smoldocling on ``file_list`` and return the stitched text of the first file.
+
+    All files are processed to JSON in ``OUTPUT_DIR``; only the first file's
+    JSON is stitched into plain text and returned. For image inputs an HTML
+    overlay is also written next to the JSON.
+
+    Raises:
+        ValueError: if ``file_list`` is empty.
+        RuntimeError: if processing produced no JSON for the first file.
+    """
+    if not file_list:
+        raise ValueError("file_list must contain at least one file path")
+    file_path = file_list[0].replace('\\', '/')
+    file_name = os.path.splitext(os.path.basename(file_path))[0]
+    json_output = os.path.join(OUTPUT_DIR, f"{file_name}.json")
+    overlay_html = os.path.join(OUTPUT_DIR, f"{file_name}_overlay.html")
+    # Drop any JSON left by an earlier upload with the same name, so a failed
+    # run cannot silently return the previous document's text.
+    if os.path.exists(json_output):
+        os.remove(json_output)
+    cli.process_files(file_list, OUTPUT_DIR, output_format='json')
+    # cli.process_files reports per-file errors on stderr instead of raising,
+    # so a missing output file is the only failure signal available here.
+    if not os.path.exists(json_output):
+        raise RuntimeError(f"Processing produced no output for {os.path.basename(file_path)}")
+    # Generate overlay (optional). The overlay opens the source with PIL,
+    # which cannot read PDFs, so it is only attempted for image inputs.
+    if not file_path.lower().endswith('.pdf'):
+        cli.generate_docling_overlay(file_path, json_output, overlay_html)
+    # Stitch final cleaned text (you can toggle GPT fixing)
+    return cli.stitch_text_from_json(json_output, gpt_fix=False)
+@app.post("/parse")
+async def parse_docling(file: UploadFile = File(...)):
+    """Save the uploaded document, run Smoldocling on it and return its text.
+
+    Responds with ``{"text": ...}`` on success, HTTP 400 when the upload has
+    no usable filename, and HTTP 500 with ``{"error": ...}`` when processing
+    fails.
+    """
+    # The filename is client-controlled: keep only its last path component
+    # (for either separator style) so it cannot escape UPLOAD_DIR.
+    safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
+    if not safe_name or safe_name in (".", ".."):
+        raise HTTPException(status_code=400, detail="No file uploaded.")
+    save_path = os.path.join(UPLOAD_DIR, safe_name)
+    with open(save_path, "wb") as buffer:
+        shutil.copyfileobj(file.file, buffer)
+    try:
+        text_output = docling_process_files([save_path])
+    except Exception as e:
+        # Request boundary: report the failure to the client instead of crashing.
+        return JSONResponse(status_code=500, content={"error": str(e)})
+    return JSONResponse(content={"text": text_output})

requirements.txt CHANGED Viewed

@@ -1,2 +1,4 @@
 fastapi
 uvicorn[standard]

 fastapi
 uvicorn[standard]
+python-multipart
+-e ./smoldocling

smoldocling/README.md ADDED Viewed

	@@ -0,0 +1,54 @@

+# Smoldocling CLI
+A command-line interface for processing document images and PDFs using the SmolDocling-256M-preview model.
+## Installation
+1. Clone this repository
+2. Install the required dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+## Usage
+The CLI supports processing one or multiple document images and PDFs at once. The processed output will be saved as HTML files.
+Basic usage:
+```bash
+python smoldocling_cli.py input1.png input2.jpg input3.pdf
+```
+Specify output directory:
+```bash
+python smoldocling_cli.py -o custom_output input1.png document.pdf
+```
+### Arguments
+- `input_files`: One or more input files (images or PDFs) to process
+- `-o, --output-dir`: Output directory for HTML files (default: 'output')
+### Example
+```bash
+python smoldocling_cli.py document1.png document2.pdf -o processed_docs
+```
+This will:
+1. Process document1.png and generate document1.html
+2. Process document2.pdf and generate document2.html (containing all pages in a single file)
+3. Create a directory called 'processed_docs' if it doesn't exist
+4. Save all HTML files in the processed_docs directory
+## Notes
+- The script will automatically create the output directory if it doesn't exist
+- Each input image file will generate a corresponding HTML file with the same name (but .html extension)
+- PDF files will generate a single HTML file containing all pages
+- Currently, PDF processing is limited to the first 3 pages due to model limitations
+- Failed processing of one file won't stop the processing of other files
+- Error messages will be printed to stderr
+- The model is loaded only once for processing multiple files

smoldocling/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+"""
+Smoldocling CLI package for processing document images.
+"""
+__version__ = "0.1.0"

smoldocling/cli.py ADDED Viewed

	@@ -0,0 +1,530 @@

+#!/usr/bin/env python3
+import argparse
+import os
+from pathlib import Path
+from PIL import Image, ImageDraw
+from docling_core.types.doc import DoclingDocument, ImageRefMode
+from docling_core.types.doc.document import DocTagsDocument
+import torch
+from transformers import AutoProcessor, AutoModelForVision2Seq
+from transformers.image_utils import load_image
+import sys
+from pdf2image import convert_from_path
+import tempfile
+import json
+import matplotlib.pyplot as plt
+from pprint import pprint
+import base64
+from dotenv import load_dotenv
+import openai
+from azure.ai.documentintelligence import DocumentIntelligenceClient
+from azure.core.credentials import AzureKeyCredential
+from smoldocling.overlays import generate_azure_overlay_html, generate_docling_overlay
+from PIL import Image
+import requests
+from io import BytesIO
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+load_dotenv()
+def load_model(verbose=True):
+    """Load the Smoldocling model and return model and processor.
+
+    Args:
+        verbose: print a progress message before loading.
+
+    Returns:
+        Tuple ``(model, processor)`` with the model moved to ``DEVICE``.
+    """
+    if verbose:
+        print("Loading Smoldocling model...")
+    model_path = "ds4sd/SmolDocling-256M-preview"
+    processor = AutoProcessor.from_pretrained(model_path)
+    # Half precision is only worthwhile on a GPU (e.g. T4); on CPU many
+    # float16 kernels are slow or unavailable, so fall back to float32 there.
+    dtype = torch.float16 if DEVICE == "cuda" else torch.float32
+    model = AutoModelForVision2Seq.from_pretrained(
+        model_path,
+        torch_dtype=dtype,
+    ).to(DEVICE)
+    return model, processor
+def run_model(model, processor, image, prompt="Convert this page to docling.", verbose=True):
+    """Generate doctags for one page image.
+
+    Builds a single-turn chat prompt (one image plus ``prompt``), runs
+    ``model.generate`` and decodes only the tokens that follow the prompt.
+    Special tokens are kept; leading whitespace is stripped from the result.
+    """
+    user_turn = {
+        "role": "user",
+        "content": [
+            {"type": "image"},
+            {"type": "text", "text": prompt},
+        ],
+    }
+    chat_text = processor.apply_chat_template([user_turn], add_generation_prompt=True)
+    model_inputs = processor(
+        text=chat_text,
+        images=[image],
+        return_tensors="pt",
+        truncation=True,  # ✅ Avoid truncation warning
+    ).to(DEVICE)
+    if verbose:
+        print("Generating text...")
+    output_ids = model.generate(**model_inputs, max_new_tokens=8192)
+    # Skip the first prompt-length positions so only the continuation is decoded.
+    new_token_ids = output_ids[:, model_inputs.input_ids.shape[1]:]
+    decoded = processor.batch_decode(new_token_ids, skip_special_tokens=False)
+    return decoded[0].lstrip()
+def extract_text_from_document(image_path, model, processor, output_format="html", verbose=True):
+    """Extract text from a document image using Smoldocling-256.
+
+    Args:
+        image_path: path of the image file to process.
+        model, processor: objects returned by ``load_model``.
+        output_format: ``"json"`` returns the exported document dict (picture
+            image URIs removed); any other value returns an HTML string.
+        verbose: print progress to stdout and errors to stderr.
+
+    Returns:
+        The exported dict or HTML string, or ``None`` if anything failed.
+    """
+    try:
+        # The context manager releases the file handle even on failure; all
+        # work that needs the image (including export) stays inside it.
+        with Image.open(image_path) as image:
+            if verbose:
+                print(f"Processing {image_path}")
+                print(f"Image mode: {image.mode}")
+                print(f"Image size: {image.size}")
+            # Run docling vlm
+            output = run_model(model, processor, image, verbose=verbose)
+            # Same doctags clean-up that process_pdf applies to every page.
+            output = output.replace("<end_of_utterance>", "").strip()
+            if "<chart>" in output:
+                output = output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
+            doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
+                [output],
+                [image]
+            )
+            doc = DoclingDocument(name=Path(image_path).stem).load_from_doctags(doctags_doc)
+            # Handle formatting and export
+            if output_format == "json":
+                # Export to dict (no images)
+                doc_dict = doc.export_to_dict()
+                for picture in doc_dict.get("pictures", []):
+                    image_ref = picture.get("image")
+                    # "image" may be absent or None; only strip a real URI.
+                    if image_ref and "uri" in image_ref:
+                        del image_ref["uri"]
+                return doc_dict
+            return doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
+    except Exception as e:
+        # Best effort by design: callers treat None as "this file failed".
+        if verbose:
+            print(f"Error processing 1: {image_path}: {str(e)}", file=sys.stderr)
+        return None
+def _clear_page_images(doc_dict):
+    """Set the ``image`` entry of every page in an exported document dict to None, in place."""
+    pages = doc_dict.get("pages")
+    if not pages:
+        return
+    # NOTE(review): docling_core appears to export "pages" as a mapping keyed
+    # by page number - confirm. Iterating a mapping yields its keys, so the
+    # page entries must be taken from .values(); a plain list is also accepted.
+    page_entries = pages.values() if isinstance(pages, dict) else pages
+    for page in page_entries:
+        if isinstance(page, dict) and "image" in page:
+            page["image"] = None
+def process_pdf(pdf_path, model, processor, output_dir, output_format="html", debug=False, verbose=True):
+    """Process a PDF file by converting it to images and processing each page.
+
+    Args:
+        pdf_path: path of the PDF to process.
+        model, processor: objects returned by ``load_model``.
+        output_dir: directory for the combined output file; when ``None`` the
+            result is returned instead of written.
+        output_format: ``"json"`` for the exported dict, anything else for HTML.
+        debug: additionally dump one ``<stem>_p<N>.json`` per page into
+            ``output_dir`` (only when ``output_dir`` is given).
+        verbose: print progress to stdout and errors to stderr.
+
+    Returns:
+        The exported dict or HTML string when ``output_dir`` is ``None`` and at
+        least one page succeeded; otherwise ``None``. Errors are not raised;
+        they are reported on stderr when ``verbose`` is set.
+    """
+    try:
+        if verbose:
+            print(f"\nProcessing PDF: {pdf_path}")
+        # Convert PDF to images
+        with tempfile.TemporaryDirectory() as temp_dir:
+            if verbose:
+                print("Converting PDF to images...")
+            # TODO: Review this. It's not working when the PDF is large.
+            images = convert_from_path(
+                pdf_path,
+                output_folder=temp_dir,
+                first_page=1,
+                fmt="png"
+            )
+            if not images:
+                if verbose:
+                    print(f"No pages found in PDF: {pdf_path}", file=sys.stderr)
+                return
+            all_doctags = []
+            all_images = []
+            for i, image in enumerate(images, start=1):
+                image_path = os.path.join(temp_dir, f"page_{i}.png")
+                image.save(image_path, "PNG")
+                if verbose:
+                    print(f"\nProcessing page {i}")
+                try:
+                    image = Image.open(image_path)
+                    if verbose:
+                        print(f"Processing {image_path}")
+                        print(f"Image mode: {image.mode}")
+                        print(f"Image size: {image.size}")
+                    output = run_model(model, processor, image, verbose=verbose)
+                    cleaned_output = output.replace("<end_of_utterance>", "").strip()
+                    # If you have charts:
+                    if "<chart>" in cleaned_output:
+                        cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
+                    all_doctags.append(cleaned_output)
+                    all_images.append(image)
+                    if verbose:
+                        print(f"Successfully processed page {i}")
+                    # DEBUG: Dump per-page JSON if requested
+                    if debug and output_dir is not None:
+                        # Create a single-page DocTagsDocument and DoclingDocument
+                        doctags_doc_page = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], [image])
+                        doc_page = DoclingDocument(name=f"{Path(pdf_path).stem}_p{i}")
+                        doc_page.load_from_doctags(doctags_doc_page)
+                        doc_dict_page = doc_page.export_to_dict()
+                        # Remove images from the dict if present
+                        _clear_page_images(doc_dict_page)
+                        page_json_path = Path(output_dir) / f"{Path(pdf_path).stem}_p{i}.json"
+                        with open(page_json_path, 'w', encoding='utf-8') as f:
+                            json.dump(doc_dict_page, f, ensure_ascii=False, indent=2)
+                        if verbose:
+                            print(f"[DEBUG] Dumped page {i} JSON to {page_json_path}")
+                except Exception as e:
+                    # A failing page is skipped so the remaining pages still get processed.
+                    if verbose:
+                        print(f"Error processing page {i}: {str(e)}", file=sys.stderr)
+            if all_doctags and all_images:
+                doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
+                    all_doctags,
+                    all_images
+                )
+                doc = DoclingDocument(name=Path(pdf_path).stem)
+                doc.load_from_doctags(doctags_doc)
+                if output_format == "json":
+                    doc_dict = doc.export_to_dict()
+                    _clear_page_images(doc_dict)
+                    if output_dir is None:
+                        return doc_dict
+                    output_filename = f"{Path(pdf_path).stem}.json"
+                    output_path = Path(output_dir) / output_filename
+                    with open(output_path, 'w', encoding='utf-8') as f:
+                        json.dump(doc_dict, f, ensure_ascii=False, indent=2)
+                    if verbose:
+                        print(f"\nSuccessfully saved combined output to {output_path}")
+                else:
+                    html_output = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
+                    if output_dir is None:
+                        return html_output
+                    output_filename = f"{Path(pdf_path).stem}.html"
+                    output_path = Path(output_dir) / output_filename
+                    with open(output_path, 'w', encoding='utf-8') as f:
+                        f.write(html_output)
+                    if verbose:
+                        print(f"\nSuccessfully saved combined output to {output_path}")
+            else:
+                if verbose:
+                    print("No pages were successfully processed", file=sys.stderr)
+    except Exception as e:
+        if verbose:
+            print(f"Error processing PDF {pdf_path}: {str(e)}", file=sys.stderr)
+def process_files(input_files, output_dir, output_format="html", debug=False, verbose=True):
+    """Process multiple input files and generate outputs in the specified format.
+
+    PDFs are handled by ``process_pdf``; every other file is treated as an
+    image and handled by ``extract_text_from_document``. The model is loaded
+    once for the whole batch, and a failure on one file does not stop the rest.
+
+    Args:
+        input_files: paths of the images/PDFs to process.
+        output_dir: directory for result files; when ``None`` results are
+            collected and returned instead of written.
+        output_format: ``"html"`` or ``"json"``.
+        debug: forwarded to ``process_pdf`` (per-page JSON dumps).
+        verbose: print progress to stdout and errors to stderr.
+
+    Returns:
+        A list with one result (dict or HTML string) per successfully
+        processed file when ``output_dir`` is ``None``; otherwise ``None``.
+    """
+    if output_dir is not None:
+        os.makedirs(output_dir, exist_ok=True)
+    model, processor = load_model(verbose=verbose)
+    results = []
+    for input_file in input_files:
+        try:
+            input_path = Path(input_file)
+            if input_path.suffix.lower() == '.pdf':
+                pdf_result = process_pdf(input_file, model, processor, output_dir, output_format=output_format, debug=debug, verbose=verbose)
+                # process_pdf returns one document (dict or HTML string), so it
+                # is appended whole rather than spread element by element.
+                if output_dir is None and pdf_result:
+                    results.append(pdf_result)
+                # PDFs are fully handled above; do not fall through to the
+                # image path, which cannot open them.
+                continue
+            if verbose:
+                print(f"\nProcessing: {input_file}")
+            doc_dict = extract_text_from_document(input_path, model, processor, output_format=output_format, verbose=verbose)
+            if not doc_dict:
+                if verbose:
+                    print(f"Failed to process {input_file}", file=sys.stderr)
+                continue
+            if output_dir is None:
+                results.append(doc_dict)
+            else:
+                output_path = Path(output_dir) / f"{input_path.stem}.{output_format}"
+                if verbose:
+                    print(f"Output will be saved to: {output_path}")
+                with open(output_path, 'w', encoding='utf-8') as f:
+                    if output_format == "json":
+                        json.dump(doc_dict, f, ensure_ascii=False, indent=2)
+                    else:
+                        # extract_text_from_document returns HTML text for
+                        # every non-JSON format.
+                        f.write(doc_dict)
+            if verbose:
+                print(f"Successfully processed {input_file}")
+        except Exception as e:
+            # Keep going with the remaining files; the error is only reported.
+            if verbose:
+                print(f"Error processing 2 {input_file}: {str(e)}", file=sys.stderr)
+    if output_dir is None:
+        return results
+def visualize_doc(doc_path, page_num=0):
+    """
+    Visualize a document (PDF or image) with bounding boxes from its corresponding JSON annotation.
+    The JSON path is derived from ``doc_path`` by replacing "input" with "output"
+    and the file extension with ".json" (images) or "_p<page_num+1>.json" (PDFs,
+    i.e. the per-page files written by ``process_pdf`` in debug mode).
+    Args:
+        doc_path (str): Path to the input document file (PDF or image)
+        page_num (int): Page number to visualize for PDFs (default 0)
+    """
+    is_pdf = doc_path.lower().endswith('.pdf')
+    # Load document
+    if is_pdf:
+        images = convert_from_path(doc_path, first_page=1)
+        image = images[page_num]
+    else:
+        image = Image.open(doc_path).convert("RGB")
+    # Load corresponding JSON. splitext handles any image extension and any
+    # letter case, not just a literal ".png" / ".pdf".
+    base, _ext = os.path.splitext(doc_path.replace("input", "output"))
+    json_path = f"{base}_p{page_num + 1}.json" if is_pdf else f"{base}.json"
+    # The JSON files are written as UTF-8, so read them the same way.
+    with open(json_path, "r", encoding="utf-8") as f:
+        doc = json.load(f)
+    bboxes = []
+    labels = []
+    def collect(items, default_label, first_page_only):
+        """Append the boxes and labels of ``items`` to ``bboxes`` / ``labels``."""
+        for item in items:
+            for prov in item.get("prov", []):
+                # currently only works for first page
+                if first_page_only and prov.get("page_no") != 1:
+                    continue
+                bbox = prov.get("bbox")
+                if bbox:
+                    bboxes.append([bbox["l"], bbox["t"], bbox["r"], bbox["b"]])
+                    labels.append(item.get("label", default_label))
+    # Texts and pictures are restricted to page 1 for PDFs; tables are not filtered.
+    collect(doc.get("texts", []), "", is_pdf)
+    collect(doc.get("pictures", []), "picture", is_pdf)
+    collect(doc.get("tables", []), "", False)
+    # Draw bounding boxes
+    draw = ImageDraw.Draw(image)
+    for (l, t, r, b), label in zip(bboxes, labels):
+        draw.rectangle([l, t, r, b], outline="red", width=2)
+        if label:
+            draw.text((l, t-10), f"{label} ({l:.1f}, {t:.1f}, {r:.1f}, {b:.1f})", fill="red")
+    # Display
+    plt.figure(figsize=(10, 12))
+    plt.imshow(image)
+    plt.axis("off")
+    plt.show()
+def stitch_text_from_json(json_path, gpt_fix=False):
+    """
+    Stitch the text fragments of a DoclingDocument-format JSON file into plain text.
+    Fragments are taken in the order given by the body's children, descending into
+    referenced groups, and joined with newlines. The result is printed and returned.
+    With gpt_fix=True the text is first sent to GPT to repair line breaks and
+    hyphenation; if the API key is missing or the call fails, the unmodified
+    stitched text is printed and returned instead.
+    """
+    with open(json_path, 'r', encoding='utf-8') as handle:
+        doc = json.load(handle)
+    # Lookup tables: texts by positional reference, groups by their own reference.
+    text_lookup = {f"#/texts/{idx}": item for idx, item in enumerate(doc.get('texts', []))}
+    group_lookup = {grp['self_ref']: grp for grp in doc.get('groups', [])}
+    def extract_texts(children):
+        collected = []
+        for child in children:
+            ref = child.get('$ref')
+            if ref is None:
+                continue
+            if ref.startswith('#/texts/'):
+                fragment = (text_lookup.get(ref) or {}).get('text', '')
+                if fragment:
+                    collected.append(fragment)
+            elif ref.startswith('#/groups/'):
+                group = group_lookup.get(ref)
+                if group:
+                    collected += extract_texts(group.get('children', []))
+        return collected
+    final_text = '\n'.join(extract_texts(doc.get('body', {}).get('children', [])))
+    if not gpt_fix:
+        print(final_text)
+        return final_text
+    try:
+        api_key = os.environ.get('OPENAI_API_KEY')
+        if not api_key:
+            print("OPENAI_API_KEY not set. Printing original stitched text.", file=sys.stderr)
+            print(final_text)
+            return final_text
+        client = openai.OpenAI(api_key=api_key)
+        prompt = (
+            "You are a helpful assistant. "
+            "The following text was extracted from a document and may contain odd line breaks, hyphenated words split across lines, or other OCR artifacts. "
+            "Please rewrite the text as clean, readable prose, fixing line breaks, joining hyphenated words, and correcting obvious errors, but do not add or remove content.\n\n"
+            f"Text to fix:\n\n{final_text}\n\nCleaned text:"
+        )
+        response = client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=4096,
+            temperature=0.0,
+        )
+        cleaned_text = response.choices[0].message.content.strip()
+        print(cleaned_text)
+        return cleaned_text
+    except Exception as e:
+        print(f"[GPT-fix error] {e}. Printing original stitched text.", file=sys.stderr)
+        print(final_text)
+        return final_text
+def extract_with_azure(input_files, output_dir, output_format="json", verbose=True):
+    """Analyze each input file with Azure Document Intelligence and save the result as JSON.
+
+    Reads the endpoint and key from AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and
+    AZURE_DOCUMENT_INTELLIGENCE_KEY; if either is missing, prints a message to
+    stderr and returns without doing anything. Each file is sent base64-encoded
+    to the "prebuilt-layout" model and the result is written to
+    ``<output_dir>/<stem>.json``. ``output_format`` is accepted but not used.
+    """
+    endpoint = os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
+    key = os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_KEY")
+    if not (endpoint and key):
+        print("Azure endpoint/key not set. Set AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and AZURE_DOCUMENT_INTELLIGENCE_KEY in your environment.", file=sys.stderr)
+        return
+    client = DocumentIntelligenceClient(endpoint, AzureKeyCredential(key))
+    os.makedirs(output_dir, exist_ok=True)
+    for input_file in input_files:
+        with open(input_file, "rb") as source:
+            encoded = base64.b64encode(source.read()).decode("utf-8")
+        analysis = client.begin_analyze_document(
+            model_id="prebuilt-layout",
+            body={"base64Source": encoded}
+        ).result()
+        output_path = Path(output_dir) / f"{Path(input_file).stem}.json"
+        with open(output_path, "w", encoding="utf-8") as out_f:
+            json.dump(analysis.as_dict(), out_f, ensure_ascii=False, indent=2)
+        if verbose:
+            print(f"Azure baseline output saved to {output_path}")
+def main():
+    """Command-line entry point: parse arguments and dispatch to a subcommand.
+
+    Subcommands: ``process``, ``overlay-html``, ``stitch-text``,
+    ``azure-baseline`` and ``azure-overlay-html``. When the first argument is
+    not a subcommand name, ``process`` is assumed, so
+    ``prog file1.png file2.pdf -o out`` works as well. Exits with status 1
+    when called with no arguments or when no input file exists.
+    """
+    parser = argparse.ArgumentParser(
+        description="Process document images and PDFs using Smoldocling and generate HTML or JSON outputs"
+    )
+    subparsers = parser.add_subparsers(dest="command", required=False)
+    # Default parser for main processing
+    parser_main = subparsers.add_parser("process", help="Process images or PDFs to HTML/JSON (default)")
+    parser_main.add_argument(
+        'input_files', nargs='+', help='One or more input files (images or PDFs) to process'
+    )
+    parser_main.add_argument(
+        '-o', '--output-dir', default='output', help='Output directory for result files (default: output)'
+    )
+    parser_main.add_argument(
+        '--format', choices=['html', 'json'], default='html', help='Output format: html or json (default: html)'
+    )
+    parser_main.add_argument(
+        '--debug', action='store_true', help='Enable debug mode: dump each PDF page as a separate JSON file.'
+    )
+    # Overlay HTML subcommand
+    parser_overlay = subparsers.add_parser("overlay-html", help="Generate HTML overlay from PNG and JSON")
+    parser_overlay.add_argument('image_file', help='Source PNG image file')
+    parser_overlay.add_argument('json_file', help='Extracted JSON file with bounding boxes')
+    parser_overlay.add_argument('-o', '--output', help='Output HTML file (default: <image_file>_overlay.html)')
+    # Stitch text subcommand
+    parser_stitch = subparsers.add_parser("stitch-text", help="Stitch together text fragments from a JSON file and print as plain text")
+    parser_stitch.add_argument('json_file', help='Extracted JSON file to stitch')
+    parser_stitch.add_argument('--gpt-fix', action='store_true', help='Send stitched text to GPT to fix line breaks and hyphenation')
+    # Azure baseline subcommand
+    parser_azure = subparsers.add_parser(
+        "azure-baseline", help="Extract content using Azure Document Intelligence for baseline comparison"
+    )
+    parser_azure.add_argument(
+        'input_files', nargs='+', help='One or more input files (images or PDFs) to process with Azure Document Intelligence'
+    )
+    parser_azure.add_argument(
+        '-o', '--output-dir', default='output_azure', help='Output directory for Azure baseline result files (default: output_azure)'
+    )
+    parser_azure.add_argument(
+        '--format', choices=['json'], default='json', help='Output format: json (default: json)'
+    )
+    # Azure overlay HTML subcommand
+    parser_azure_overlay = subparsers.add_parser("azure-overlay-html", help="Generate HTML overlay for Azure Document Intelligence output (words)")
+    parser_azure_overlay.add_argument('--image', required=True, help='Path to scanned image file')
+    parser_azure_overlay.add_argument('--json', required=True, help='Path to Azure Document Intelligence JSON file')
+    parser_azure_overlay.add_argument('--output', required=True, help='Path to output HTML file')
+    # argparse has no default-subcommand concept: without this, running the
+    # tool with bare file names is rejected and running it with no subcommand
+    # leaves args without input_files. Treat anything that is not a known
+    # subcommand (or a help flag) as arguments to "process".
+    commands = ("process", "overlay-html", "stitch-text", "azure-baseline", "azure-overlay-html")
+    argv = sys.argv[1:]
+    if argv and argv[0] not in commands and argv[0] not in ("-h", "--help"):
+        argv = ["process"] + argv
+    args = parser.parse_args(argv)
+    if args.command is None:
+        # Called with no arguments at all: show usage instead of crashing.
+        parser.print_help(sys.stderr)
+        sys.exit(1)
+    if args.command == "overlay-html":
+        output_html = args.output or (os.path.splitext(args.image_file)[0] + "_overlay.html")
+        generate_docling_overlay(args.image_file, args.json_file, output_html)
+        return
+    if args.command == "stitch-text":
+        stitch_text_from_json(args.json_file, gpt_fix=getattr(args, 'gpt_fix', False))
+        return
+    if args.command == "azure-baseline":
+        extract_with_azure(
+            args.input_files,
+            args.output_dir,
+            output_format=args.format,
+            verbose=True
+        )
+        return
+    if args.command == "azure-overlay-html":
+        generate_azure_overlay_html(args.image, args.json, args.output)
+        return
+    # Remaining command: process
+    valid_files = []
+    for file_path in args.input_files:
+        if not os.path.exists(file_path):
+            print(f"Warning: File not found: {file_path}", file=sys.stderr)
+        else:
+            valid_files.append(file_path)
+    if not valid_files:
+        print("Error: No valid input files provided", file=sys.stderr)
+        sys.exit(1)
+    process_files(valid_files, args.output_dir, output_format=args.format, debug=args.debug)
+if __name__ == '__main__':
+    main()

smoldocling/overlays.py ADDED Viewed

	@@ -0,0 +1,305 @@

+import argparse
+import json
+from PIL import Image
+import os
+import base64
+# Page skeleton for the Azure word overlay, filled in with str.format().
+# Placeholders: {img_width}, {img_height}, {img_src} and {boxes}. Literal CSS
+# braces are doubled ({{ }}) so that format() leaves them alone.
+HTML_TEMPLATE = '''<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<title>Document Overlay</title>
+<style>
+  .overlay-container {{
+    position: relative;
+    width: {img_width}px;
+    height: {img_height}px;
+    background: url('{img_src}') no-repeat;
+    background-size: 100% 100%;
+    border: 1px solid #ccc;
+  }}
+  .word-box {{
+    position: absolute;
+    border: 1px solid #e74c3c;
+    background: rgba(255,255,0,0.3);
+    font-size: 12px;
+    color: #222;
+    padding: 0;
+    margin: 0;
+    line-height: 1;
+    pointer-events: none;
+    white-space: pre;
+    overflow: hidden;
+  }}
+</style>
+</head>
+<body>
+<div class="overlay-container">
+{boxes}
+</div>
+</body>
+</html>
+'''
+def load_image_size(image_path):
+    """Return the ``(width, height)`` of the image at ``image_path`` in pixels."""
+    with Image.open(image_path) as picture:
+        return picture.size
+def extract_words(json_data):
+    """Collect ``{'text', 'polygon'}`` dicts for every word on every page.
+
+    Reads ``pages[*]['words'][*]`` (Azure Document Intelligence v4 layout) and
+    keeps only words whose polygon has exactly 8 numbers, i.e. 4 points
+    (x0, y0, ..., x3, y3).
+    """
+    words = []
+    for page in json_data.get('pages', []):
+        words.extend(
+            {'text': entry.get('content', ''), 'polygon': entry['polygon']}
+            for entry in page.get('words', [])
+            if len(entry.get('polygon', [])) == 8
+        )
+    return words
+def polygon_to_bbox(polygon):
+    """Return ``(x_min, y_min, x_max, y_max)`` for a flat ``[x0, y0, x1, y1, ...]`` polygon."""
+    x_coords = polygon[::2]
+    y_coords = polygon[1::2]
+    return min(x_coords), min(y_coords), max(x_coords), max(y_coords)
+def scale_polygon(polygon, scale_x, scale_y):
+    """Scale the first 8 numbers of a flat polygon: x values (even indices) by ``scale_x``, y values by ``scale_y``."""
+    factors = (scale_x, scale_y)
+    return [polygon[idx] * factors[idx % 2] for idx in range(8)]
+def generate_azure_overlay_html(image_path, json_path, output_path):
+    """Write an HTML page that overlays Azure word boxes on the scanned image.
+
+    Args:
+        image_path: scanned image; referenced from the HTML by relative path.
+        json_path: Azure Document Intelligence result saved as JSON.
+        output_path: HTML file to write.
+
+    Raises:
+        ValueError: if the JSON contains no pages.
+    """
+    # Function-scope import keeps this block self-contained.
+    from html import escape
+    # Load image size
+    img_width, img_height = load_image_size(image_path)
+    # Load JSON
+    with open(json_path, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+    pages = data.get('pages') or []
+    if not pages:
+        raise ValueError(f"No pages found in {json_path}")
+    # Get page dimensions from JSON (assume first page). Words from every page
+    # are drawn with this one scale - NOTE(review): only correct for
+    # single-page input; confirm with callers.
+    page = pages[0]
+    # `or` also covers a 0/None size, which would otherwise divide by zero.
+    doc_width = page.get('width') or img_width
+    doc_height = page.get('height') or img_height
+    # Compute scaling factors from document units to image pixels
+    scale_x = img_width / doc_width
+    scale_y = img_height / doc_height
+    # Generate HTML boxes
+    boxes = []
+    for word in extract_words(data):
+        scaled_poly = scale_polygon(word['polygon'], scale_x, scale_y)
+        # Use the polygon's bounding box rather than two of its corners, so
+        # rotated or re-ordered polygons still get a correctly placed box.
+        left, top, right, bottom = polygon_to_bbox(scaled_poly)
+        style = f"left:{left:.2f}px;top:{top:.2f}px;width:{right - left:.2f}px;height:{bottom - top:.2f}px;"
+        # Recognised text is arbitrary document content: escape it for HTML.
+        box_html = f'<span class="word-box" style="{style}">{escape(word["text"])}</span>'
+        boxes.append(box_html)
+    # Use relative path for image in HTML
+    img_src = os.path.relpath(image_path, os.path.dirname(output_path))
+    page_html = HTML_TEMPLATE.format(
+        img_width=img_width,
+        img_height=img_height,
+        img_src=img_src,
+        boxes='\n'.join(boxes)
+    )
+    # The template declares charset UTF-8, so write the file as UTF-8.
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write(page_html)
+    print(f"Overlay HTML written to {output_path}")
+def generate_docling_overlay(image_path, json_path, output_path):
+    """
+    Generate an HTML file overlaying bounding boxes from the JSON on the image, with tooltips showing the extracted text on hover.
+    Returns the HTML content as a string.
+    """
+    # Load image and encode as base64
+    with open(image_path, "rb") as img_f:
+        img_bytes = img_f.read()
+        img_b64 = base64.b64encode(img_bytes).decode("utf-8")
+    from PIL import Image as PILImage
+    img = PILImage.open(image_path)
+    img_width, img_height = img.size
+    # Load JSON
+    with open(json_path, "r") as f:
+        doc = json.load(f)
+    # Collect bounding boxes and texts
+    boxes = []
+    # Texts: red
+    for text in doc.get("texts", []):
+        for prov in text.get("prov", []):
+            bbox = prov.get("bbox")
+            if bbox:
+                l, t, r, b = bbox["l"], bbox["t"], bbox["r"], bbox["b"]
+                boxes.append({
+                    "l": l,
+                    "t": t,
+                    "r": r,
+                    "b": b,
+                    "text": text.get("text", ""),
+                    "type": "text"
+                })
+    # Pictures: green
+    for pic in doc.get("pictures", []):
+        for prov in pic.get("prov", []):
+            bbox = prov.get("bbox")
+            if bbox:
+                l, t, r, b = bbox["l"], bbox["t"], bbox["r"], bbox["b"]
+                boxes.append({
+                    "l": l,
+                    "t": t,
+                    "r": r,
+                    "b": b,
+                    "text": pic.get("label", "picture"),
+                    "type": "picture"
+                })
+    # Groups: blue (enclosing all children)
+    def get_bbox_for_refs(refs, texts_by_ref):
+        # Get all bboxes for the referenced texts (recursively for groups)
+        bboxes = []
+        for ref in refs:
+            if ref["$ref"].startswith("#/texts/"):
+                text = texts_by_ref.get(ref["$ref"])
+                if text:
+                    for prov in text.get("prov", []):
+                        bbox = prov.get("bbox")
+                        if bbox:
+                            bboxes.append(bbox)
+            elif ref["$ref"].startswith("#/groups/"):
+                group = groups_by_ref.get(ref["$ref"])
+                if group:
+                    bboxes.extend(get_bbox_for_refs(group.get("children", []), texts_by_ref))
+        return bboxes
+    groups_by_ref = {g["self_ref"]: g for g in doc.get("groups", [])}
+    texts_by_ref = {t["self_ref"]: t for t in doc.get("texts", [])}
+    for group in doc.get("groups", []):
+        bboxes = get_bbox_for_refs(group.get("children", []), texts_by_ref)
+        if bboxes:
+            l = min(b["l"] for b in bboxes)
+            t = min(b["t"] for b in bboxes)
+            r = max(b["r"] for b in bboxes)
+            b_ = max(b["b"] for b in bboxes)
+            boxes.append({
+                "l": l,
+                "t": t,
+                "r": r,
+                "b": b_,
+                "text": group.get("label", "group"),
+                "type": "group"
+            })
+    # Build HTML as a list of lines
+    html_lines = [
+        '<!DOCTYPE html>',
+        '<html lang="en">',
+        '<head>',
+        '<meta charset="UTF-8">',
+        f'<title>Overlay for {os.path.basename(image_path)}</title>',
+        '<style>',
+        f'''.container {{
+    position: relative;
+    width: {img_width}px;
+    height: {img_height}px;
+    background: #222;
+  }}
+  .overlay-img {{
+    display: block;
+    width: {img_width}px;
+    height: {img_height}px;
+  }}
+  .bbox {{
+    position: absolute;
+    box-sizing: border-box;
+    cursor: pointer;
+  }}
+  .bbox-text {{
+    border: 2px solid red;
+  }}
+  .bbox-picture {{
+    border: 2px solid green;
+  }}
+  .bbox-group {{
+    border: 2px solid blue;
+  }}
+  .tooltip {{
+    display: none;
+    position: absolute;
+    background: #fff;
+    color: #222;
+    border: 1px solid #888;
+    padding: 6px 10px;
+    border-radius: 4px;
+    z-index: 10;
+    pointer-events: none;
+    max-width: 400px;
+    font-size: 15px;
+    box-shadow: 0 2px 8px rgba(0,0,0,0.2);
+    white-space: pre-line;
+  }}''',
+        '</style>',
+        '</head>',
+        '<body>',
+        f'<h2>Overlay for {os.path.basename(image_path)}</h2>',
+        f'<div class="container" id="img-container">',
+        f'  <img src="data:image/png;base64,{img_b64}" class="overlay-img" alt="source image">'
+    ]
+    # Add bounding boxes
+    for i, box in enumerate(boxes):
+        left = box["l"]
+        top = box["t"]
+        width = box["r"] - box["l"]
+        height = box["b"] - box["t"]
+        text = box["text"].replace('"', '&quot;').replace("'", "&#39;")
+        box_class = f"bbox bbox-{box['type']}"
+        html_lines.append(f'<div class="{box_class}" style="left:{left}px;top:{top}px;width:{width}px;height:{height}px;" data-tooltip="{text}" onmousemove="showTooltip(event, {i})" onmouseleave="hideTooltip()"></div>')
+    html_lines.append('<div class="tooltip" id="tooltip"></div>')
+    html_lines.append('</div>')
+    html_lines.append('''<script>
+const tooltip = document.getElementById('tooltip');
+function showTooltip(e, idx) {
+  const bbox = e.target;
+  const text = bbox.getAttribute('data-tooltip');
+  tooltip.innerText = text;
+  tooltip.style.display = 'block';
+  // Position tooltip near mouse, but inside container
+  const container = document.getElementById('img-container');
+  let x = e.clientX - container.getBoundingClientRect().left + 10;
+  let y = e.clientY - container.getBoundingClientRect().top + 10;
+  // Clamp to container
+  x = Math.min(x, container.offsetWidth - tooltip.offsetWidth - 10);
+  y = Math.min(y, container.offsetHeight - tooltip.offsetHeight - 10);
+  tooltip.style.left = x + 'px';
+  tooltip.style.top = y + 'px';
+}
+function hideTooltip() {
+  tooltip.style.display = 'none';
+}
+</script>''')
+    html_lines.append('</body></html>')
+    html = '\n'.join(html_lines)
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(html)
+    print(f"Overlay HTML written to {output_path}")
+    return html
+def main():
+    parser = argparse.ArgumentParser(description="Generate HTML overlay for Azure Document Intelligence output.")
+    parser.add_argument('--json', required=True, help='Path to Azure Document Intelligence JSON file')
+    parser.add_argument('--image', required=True, help='Path to scanned image file')
+    parser.add_argument('--output', required=True, help='Path to output HTML file')
+    args = parser.parse_args()
+    generate_azure_overlay_html(args.image, args.json, args.output)
+if __name__ == '__main__':  # run the command-line entry point only when executed directly
+    main()

smoldocling/pyproject.toml ADDED Viewed

	@@ -0,0 +1,28 @@

+[build-system]
+requires = ["setuptools>=45", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "smoldocling"
+version = "0.1.0"
+description = "CLI tool for processing document images using Smoldocling"
+authors = [
+    {name = "Your Name", email = "your.email@example.com"},
+]
+dependencies = [
+    "docling-core",
+    # "mlx-vlm",
+    "Pillow>=10.0.0",
+    "pdf2image>=1.16.3",
+    "mcp[cli]>=1.7.0",
+    "fastapi[standard]>=0.115.12",
+    "torch>=2.7.0",
+    "openai>=1.78.1",
+]
+requires-python = ">=3.10"
+[project.scripts]
+smoldocling = "smoldocling.cli:main"
+[tool.setuptools]
+packages = ["smoldocling"]

smoldocling/server.py ADDED Viewed

	@@ -0,0 +1,20 @@

+from mcp.server.fastmcp import FastMCP
+from fastapi import FastAPI
+from smoldocling.cli import process_files
+import json
+# app = FastAPI()
+mcp = FastMCP("Smoldocling Document Extractor")  # server instance; tools are registered on it with @mcp.tool()
+@mcp.tool()
+def extract_document(file_path: str) -> dict:
+    """
+    Extract text and structure from a document at the given file path.
+    Returns a dictionary with the extracted document information as JSON.
+    """
+    result = process_files([file_path], output_dir=None, output_format="json", verbose=False)
+    return result
+# app.mount("/mcp", mcp)
+if __name__ == "__main__":
+    mcp.run(transport='stdio')  # run the server with the stdio transport when executed as a script

smoldocling/test_server.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import asyncio
+from mcp import ClientSession, StdioServerParameters
+from mcp.client.stdio import stdio_client
+import os
+async def main():
+    # Start the server using the same command as before
+    server_params = StdioServerParameters(
+        command="python",
+        args=["smoldocling/server.py"],
+        env=os.environ.copy(),
+    )
+    async with stdio_client(server_params) as (read, write):
+        async with ClientSession(read, write) as session:
+            # Initialize the connection
+            await session.initialize()
+            # List available tools
+            tools = await session.list_tools()
+            print("[DEBUG] Available tools:", tools)
+            # Call the extract_document tool
+            result = await session.call_tool(
+                "extract_document",
+                arguments={"file_path": "input/p2.png"}
+            )
+            print("[DEBUG] extract_document result:", result)
+if __name__ == "__main__":
+    asyncio.run(main())  # drive the async client to completion when executed as a script

smoldocling/testrun.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import sys
+sys.path.append('../')
+from smoldocling import cli
+from ipywidgets import HTML
+import dotenv
+output_png = '../data/legislatures/AZ_h_1913_apr_special_p9.png'
+output_dir = '../output/'
+cli.process_files([output_png], output_dir, output_format="json")
+fileName = output_png[output_png.rfind("/")+1:].replace(".png",'')
+json_output = output_dir + fileName + ".json"
+overlay_html = output_dir + fileName + "_overlay.html"
+html_output = cli.generate_docling_overlay(output_png, json_output, overlay_html)
+dotenv.load_dotenv()
+cleaned_text = cli.stitch_text_from_json(json_output, gpt_fix=False)