Spaces:

jobian
/

smoldocling-api

Running

File size: 9,870 Bytes

ceaf2e8

import argparse
import base64
import html
import json
import mimetypes
import os

from PIL import Image

# Page skeleton for the Azure word overlay. It is filled in with str.format()
# using the fields img_width, img_height, img_src and boxes. Literal CSS braces
# are doubled ({{ and }}) so that str.format() emits them as single braces.
HTML_TEMPLATE = '''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Document Overlay</title>
<style>
  .overlay-container {{
    position: relative;
    width: {img_width}px;
    height: {img_height}px;
    background: url('{img_src}') no-repeat;
    background-size: 100% 100%;
    border: 1px solid #ccc;
  }}
  .word-box {{
    position: absolute;
    border: 1px solid #e74c3c;
    background: rgba(255,255,0,0.3);
    font-size: 12px;
    color: #222;
    padding: 0;
    margin: 0;
    line-height: 1;
    pointer-events: none;
    white-space: pre;
    overflow: hidden;
  }}
</style>
</head>
<body>
<div class="overlay-container">
{boxes}
</div>
</body>
</html>
'''

def load_image_size(image_path):
    """Return the ``(width, height)`` of the image stored at *image_path*."""
    # The context manager closes the underlying file once the size is read.
    with Image.open(image_path) as picture:
        dimensions = (picture.width, picture.height)
    return dimensions

def extract_words(json_data):
    """Collect the words of every page in an Azure layout result.

    Words are read from ``json_data['pages'][i]['words']``. A word is kept
    only when its ``polygon`` holds exactly eight numbers (four x/y points);
    a missing ``content`` becomes an empty string.

    Returns:
        A list of ``{'text': ..., 'polygon': ...}`` dicts in document order.
    """
    collected = []
    for page in json_data.get('pages', []):
        for entry in page.get('words', []):
            outline = entry.get('polygon', [])
            # Skip anything that is not a 4-point polygon.
            if len(outline) != 8:
                continue
            collected.append({'text': entry.get('content', ''), 'polygon': outline})
    return collected

def polygon_to_bbox(polygon):
    """Return the axis-aligned bounds ``(x_min, y_min, x_max, y_max)``.

    *polygon* is a flat sequence ``[x0, y0, x1, y1, ...]``: even positions
    are x coordinates and odd positions are y coordinates.
    """
    x_coords = polygon[::2]
    y_coords = polygon[1::2]
    return min(x_coords), min(y_coords), max(x_coords), max(y_coords)

def scale_polygon(polygon, scale_x, scale_y):
    """Scale a flat ``[x0, y0, x1, y1, ...]`` polygon.

    Values at even positions (x) are multiplied by *scale_x* and values at
    odd positions (y) by *scale_y*. Every coordinate in *polygon* is scaled,
    so polygons with any number of points are supported.

    Returns:
        A new list with the scaled coordinates; *polygon* is not modified.
    """
    return [
        value * (scale_x if index % 2 == 0 else scale_y)
        for index, value in enumerate(polygon)
    ]

def generate_azure_overlay_html(image_path, json_path, output_path):
    """Write an HTML page that overlays Azure layout words on the image.

    The page uses the image as a CSS background and places one absolutely
    positioned ``<span>`` per recognised word on top of it.

    Args:
        image_path: Path to the scanned image.
        json_path: Path to the Azure Document Intelligence JSON result.
        output_path: Path of the HTML file to write.

    Raises:
        KeyError, IndexError: If the JSON has no ``pages`` entry or it is empty.
    """
    img_width, img_height = load_image_size(image_path)

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Page dimensions are taken from the first page only.
    # NOTE(review): extract_words() returns words from every page, so a
    # multi-page result would be drawn onto this single image - confirm that
    # callers only pass single-page results.
    page = data['pages'][0]
    # Fall back to the image size when the page size is missing or zero, so
    # the scale factors never divide by zero.
    doc_width = page.get('width') or img_width
    doc_height = page.get('height') or img_height

    # Factors that map document units onto image pixels.
    scale_x = img_width / doc_width
    scale_y = img_height / doc_height

    boxes = []
    for word in extract_words(data):
        scaled_poly = scale_polygon(word['polygon'], scale_x, scale_y)
        # Use the enclosing axis-aligned box so rotated or mirrored polygons
        # still get a correct top-left corner and a non-negative size.
        left, top, right, bottom = polygon_to_bbox(scaled_poly)
        style = (
            f"left:{left:.2f}px;top:{top:.2f}px;"
            f"width:{right - left:.2f}px;height:{bottom - top:.2f}px;"
        )
        # OCR text is untrusted: escape it so '<' and '&' cannot break the page.
        label = html.escape(str(word['text']))
        boxes.append(f'<span class="word-box" style="{style}">{label}</span>')

    # Reference the image relative to the HTML file. Forward slashes are
    # required because a backslash starts an escape sequence in CSS url().
    img_src = os.path.relpath(image_path, os.path.dirname(output_path))
    img_src = img_src.replace(os.sep, '/')

    page_html = HTML_TEMPLATE.format(
        img_width=img_width,
        img_height=img_height,
        img_src=img_src,
        boxes='\n'.join(boxes),
    )

    # The template declares UTF-8, so the file is written as UTF-8.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(page_html)
    print(f"Overlay HTML written to {output_path}")

def _docling_item_boxes(items, box_type, label_of):
    """Build one overlay box per provenance bbox attached to *items*.

    Args:
        items: Iterable of dicts that may carry a ``prov`` list.
        box_type: Value stored under ``"type"``; it selects the CSS class.
        label_of: Callable that maps an item to its tooltip text.

    Returns:
        A list of dicts with keys ``l``, ``t``, ``r``, ``b``, ``text``, ``type``.
    """
    boxes = []
    for item in items:
        for prov in item.get("prov", []):
            bbox = prov.get("bbox")
            if not bbox:
                continue
            boxes.append({
                "l": bbox["l"],
                "t": bbox["t"],
                "r": bbox["r"],
                "b": bbox["b"],
                "text": label_of(item),
                "type": box_type,
            })
    return boxes


def _docling_group_boxes(doc):
    """Build one enclosing overlay box per group that contains located texts.

    A group's box is the union of the bboxes of all texts it references,
    following nested ``#/groups/`` references. Groups without any located
    text produce no box.
    """
    groups_by_ref = {g["self_ref"]: g for g in doc.get("groups", [])}
    texts_by_ref = {t["self_ref"]: t for t in doc.get("texts", [])}

    def collect(refs, ancestors):
        # *ancestors* holds the group refs on the current path; a reference
        # back to one of them is a cycle and is skipped instead of recursing
        # until the interpreter's recursion limit is hit.
        found = []
        for ref in refs:
            target = ref["$ref"]
            if target.startswith("#/texts/"):
                text = texts_by_ref.get(target)
                if text:
                    for prov in text.get("prov", []):
                        bbox = prov.get("bbox")
                        if bbox:
                            found.append(bbox)
            elif target.startswith("#/groups/"):
                if target in ancestors:
                    continue
                group = groups_by_ref.get(target)
                if group:
                    found.extend(
                        collect(group.get("children", []), ancestors | {target})
                    )
        return found

    boxes = []
    for group in doc.get("groups", []):
        bboxes = collect(group.get("children", []), {group["self_ref"]})
        if not bboxes:
            continue
        boxes.append({
            "l": min(b["l"] for b in bboxes),
            "t": min(b["t"] for b in bboxes),
            "r": max(b["r"] for b in bboxes),
            "b": max(b["b"] for b in bboxes),
            "text": group.get("label", "group"),
            "type": "group",
        })
    return boxes


def generate_docling_overlay(image_path, json_path, output_path):
    """Write an HTML overlay of the JSON bounding boxes on top of the image.

    The image is embedded as a base64 data URI. Text boxes are drawn in red,
    picture boxes in green and group boxes (enclosing their child texts) in
    blue. Hovering a box shows its text or label in a tooltip.

    NOTE(review): bbox values are used as-is as CSS pixel offsets from the
    top-left corner of the image - confirm the JSON uses that origin and the
    image's pixel scale.

    Args:
        image_path: Path to the source image.
        json_path: Path to the JSON document with ``texts``, ``pictures`` and
            ``groups`` entries.
        output_path: Path of the HTML file to write.

    Returns:
        The generated HTML as a string.
    """
    # Embed the image so the HTML file is self-contained.
    with open(image_path, "rb") as img_f:
        img_b64 = base64.b64encode(img_f.read()).decode("utf-8")
    # load_image_size() closes the image file once the size has been read.
    img_width, img_height = load_image_size(image_path)

    # Label the data URI with the image's real type, guessed from the file
    # name; fall back to PNG when the extension is unknown or not an image.
    guessed_mime, _ = mimetypes.guess_type(image_path)
    if guessed_mime and guessed_mime.startswith("image/"):
        img_mime = guessed_mime
    else:
        img_mime = "image/png"

    with open(json_path, "r", encoding="utf-8") as f:
        doc = json.load(f)

    # Order matters for stacking: texts first, then pictures, then groups.
    boxes = _docling_item_boxes(
        doc.get("texts", []), "text", lambda item: item.get("text", "")
    )
    boxes += _docling_item_boxes(
        doc.get("pictures", []), "picture", lambda item: item.get("label", "picture")
    )
    boxes += _docling_group_boxes(doc)

    # The file name ends up inside HTML text, so escape it.
    image_name = html.escape(os.path.basename(image_path))

    # Build HTML as a list of lines
    html_lines = [
        '<!DOCTYPE html>',
        '<html lang="en">',
        '<head>',
        '<meta charset="UTF-8">',
        f'<title>Overlay for {image_name}</title>',
        '<style>',
        f'''.container {{
    position: relative;
    width: {img_width}px;
    height: {img_height}px;
    background: #222;
  }}
  .overlay-img {{
    display: block;
    width: {img_width}px;
    height: {img_height}px;
  }}
  .bbox {{
    position: absolute;
    box-sizing: border-box;
    cursor: pointer;
  }}
  .bbox-text {{
    border: 2px solid red;
  }}
  .bbox-picture {{
    border: 2px solid green;
  }}
  .bbox-group {{
    border: 2px solid blue;
  }}
  .tooltip {{
    display: none;
    position: absolute;
    background: #fff;
    color: #222;
    border: 1px solid #888;
    padding: 6px 10px;
    border-radius: 4px;
    z-index: 10;
    pointer-events: none;
    max-width: 400px;
    font-size: 15px;
    box-shadow: 0 2px 8px rgba(0,0,0,0.2);
    white-space: pre-line;
  }}''',
        '</style>',
        '</head>',
        '<body>',
        f'<h2>Overlay for {image_name}</h2>',
        '<div class="container" id="img-container">',
        f'  <img src="data:{img_mime};base64,{img_b64}" class="overlay-img" alt="source image">'
    ]
    # Add bounding boxes
    for i, box in enumerate(boxes):
        left = box["l"]
        top = box["t"]
        width = box["r"] - box["l"]
        height = box["b"] - box["t"]
        # Escape &, <, > and both quote characters so arbitrary document text
        # cannot terminate the attribute or be parsed as markup.
        tooltip_text = html.escape(box["text"], quote=True)
        box_class = f"bbox bbox-{box['type']}"
        html_lines.append(
            f'<div class="{box_class}" '
            f'style="left:{left}px;top:{top}px;width:{width}px;height:{height}px;" '
            f'data-tooltip="{tooltip_text}" '
            f'onmousemove="showTooltip(event, {i})" onmouseleave="hideTooltip()"></div>'
        )
    html_lines.append('<div class="tooltip" id="tooltip"></div>')
    html_lines.append('</div>')
    html_lines.append('''<script>
const tooltip = document.getElementById('tooltip');
function showTooltip(e, idx) {
  const bbox = e.target;
  const text = bbox.getAttribute('data-tooltip');
  tooltip.innerText = text;
  tooltip.style.display = 'block';
  // Position tooltip near mouse, but inside container
  const container = document.getElementById('img-container');
  let x = e.clientX - container.getBoundingClientRect().left + 10;
  let y = e.clientY - container.getBoundingClientRect().top + 10;
  // Clamp to container
  x = Math.min(x, container.offsetWidth - tooltip.offsetWidth - 10);
  y = Math.min(y, container.offsetHeight - tooltip.offsetHeight - 10);
  tooltip.style.left = x + 'px';
  tooltip.style.top = y + 'px';
}
function hideTooltip() {
  tooltip.style.display = 'none';
}
</script>''')
    html_lines.append('</body></html>')
    html_document = '\n'.join(html_lines)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_document)
    print(f"Overlay HTML written to {output_path}")
    return html_document

def main():
    """Parse the command line and generate the Azure overlay HTML file."""
    cli = argparse.ArgumentParser(
        description="Generate HTML overlay for Azure Document Intelligence output."
    )
    # All three options are mandatory; they are declared from one table.
    required_options = (
        ('--json', 'Path to Azure Document Intelligence JSON file'),
        ('--image', 'Path to scanned image file'),
        ('--output', 'Path to output HTML file'),
    )
    for flag, description in required_options:
        cli.add_argument(flag, required=True, help=description)
    parsed = cli.parse_args()
    generate_azure_overlay_html(parsed.image, parsed.json, parsed.output)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()