Spaces:
Sleeping
Sleeping
File size: 4,996 Bytes
1b7aff0 d887fd5 6c102e5 99c8757 df46f51 99c8757 6c102e5 8dc569d 6c102e5 aa63203 8dc569d 99c8757 8dc569d 6c102e5 8dc569d 6c102e5 8dc569d 99c8757 1b7aff0 b85af28 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import re
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
# The checkpoint is fetched a single time at import so that every request
# handled by the app reuses the same processor and model objects.
_MODEL_ID = "ds4sd/SmolDocling-256M-preview"
processor = AutoProcessor.from_pretrained(_MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(_MODEL_ID)
def smoldocling_readimage(image, prompt_text="Convert to docling"):
    """Run SmolDocling on one image and return the decoded model output.

    The prompt is a single user turn made of an image placeholder followed
    by ``prompt_text``. Only the tokens located after the prompt are
    decoded. Special tokens are kept in the decoded text, except that the
    ``<end_of_utterance>`` marker is removed before the surrounding
    whitespace is stripped.

    Args:
        image: The image handed to the processor (the UI supplies PIL images).
        prompt_text: Instruction text placed after the image in the prompt.

    Returns:
        The decoded generation as a string.
    """
    user_turn = {
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": prompt_text}],
    }
    chat_prompt = processor.apply_chat_template([user_turn], add_generation_prompt=True)
    model_inputs = processor(text=chat_prompt, images=[image], return_tensors="pt")
    generated_ids = model.generate(**model_inputs, max_new_tokens=1024)

    # Skip the first `prompt_token_count` positions so the prompt itself is
    # not part of the decoded text.
    prompt_token_count = model_inputs.input_ids.shape[1]
    continuation_ids = generated_ids[:, prompt_token_count:]

    decoded = processor.batch_decode(continuation_ids, skip_special_tokens=False)
    return decoded[0].replace("<end_of_utterance>", "").strip()
def extract_numbers(docling_text):
    """Return every numeric literal found in ``docling_text`` as a float.

    Integers ("12"), decimals ("3.5", ".25") and an optional leading sign
    ("-5", "+3.5") are recognised. The pattern runs over the raw text, so
    digits that appear inside markup are extracted as well.

    Args:
        docling_text: Text to scan.

    Returns:
        A list of floats in order of appearance; empty when nothing matches.
    """
    # The sign has to sit outside the alternation. Written as
    # "[-+]?\d*\.\d+|\d+" it binds only to the decimal branch, so "-5" would
    # be read as 5.0 while "-5.0" would be read as -5.0.
    number_pattern = r"[-+]?(?:\d*\.\d+|\d+)"
    return [float(token) for token in re.findall(number_pattern, docling_text)]
def compare_outputs(img1, img2):
    """Compare the numbers SmolDocling extracts from two images.

    Both images are converted to text with ``smoldocling_readimage``, the
    numbers in each text are pulled out with ``extract_numbers``, and the two
    number lists are compared position by position.

    Args:
        img1: First uploaded image, or ``None`` when nothing was uploaded.
        img2: Second uploaded image, or ``None`` when nothing was uploaded.

    Returns:
        A text report with both raw outputs, the similarity percentage and
        the match count, or a short prompt asking for both images when one
        of them is missing.
    """
    # An empty upload slot arrives as None; stop here with a readable message
    # instead of letting the processor fail on it.
    if img1 is None or img2 is None:
        return "Please upload both images before comparing."

    output1 = smoldocling_readimage(img1)
    output2 = smoldocling_readimage(img2)

    nums1 = extract_numbers(output1)
    nums2 = extract_numbers(output2)

    # Positional comparison: zip stops at the shorter list, so surplus values
    # in the longer list can never count as matches.
    matches = sum(1 for a, b in zip(nums1, nums2) if abs(a - b) < 1e-3)

    # The longer list is the denominator, so unmatched surplus values lower
    # the score.
    total = max(len(nums1), len(nums2))
    accuracy = (matches / total) * 100 if total > 0 else 0

    result_text = (
        f"Output for Image 1:\n{output1}\n\n"
        f"Output for Image 2:\n{output2}\n\n"
        f"Similarity Accuracy: {accuracy:.2f}%\n"
        f"Matching Values: {matches} out of {total}"
    )
    return result_text
# Gradio UI: two image uploads go in, one plain-text similarity report comes out.
_image_inputs = [
    gr.Image(type="pil", label="Upload Image 1"),
    gr.Image(type="pil", label="Upload Image 2"),
]
demo = gr.Interface(
    fn=compare_outputs,
    inputs=_image_inputs,
    outputs="text",
    title="SmolDocling Image Comparison",
    description="Upload two document images. This app extracts data from both and compares similarity.",
)
demo.launch()
import re
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
# The checkpoint is fetched a single time at import so that every request
# handled by the app reuses the same processor and model objects.
_MODEL_ID = "ds4sd/SmolDocling-256M-preview"
processor = AutoProcessor.from_pretrained(_MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(_MODEL_ID)
def smoldocling_readimage(image, prompt_text="Convert to docling"):
    """Convert one image to text with the SmolDocling model.

    A one-turn chat prompt (image placeholder plus ``prompt_text``) is
    rendered through the processor's chat template and passed to the model
    together with the image. Only the positions after the prompt are decoded,
    special tokens are kept, and the ``<end_of_utterance>`` marker is removed
    before leading and trailing whitespace is stripped.

    Args:
        image: The image handed to the processor (the UI supplies PIL images).
        prompt_text: Instruction text placed after the image in the prompt.

    Returns:
        The decoded generation as a string.
    """
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt_text},
            ],
        },
    ]
    rendered_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    batch = processor(text=rendered_prompt, images=[image], return_tensors="pt")
    sequences = model.generate(**batch, max_new_tokens=1024)

    # Keep only what lies past the prompt so the prompt is not echoed back.
    prompt_width = batch.input_ids.shape[1]
    texts = processor.batch_decode(sequences[:, prompt_width:], skip_special_tokens=False)

    first_text = texts[0]
    return first_text.replace("<end_of_utterance>", "").strip()
def extract_numbers(docling_text):
    """Return every numeric literal found in ``docling_text`` as a float.

    Integers ("12"), decimals ("3.5", ".25") and an optional leading sign
    ("-5", "+3.5") are recognised. The pattern runs over the raw text, so
    digits that appear inside markup are extracted as well.

    Args:
        docling_text: Text to scan.

    Returns:
        A list of floats in order of appearance; empty when nothing matches.
    """
    # The sign has to sit outside the alternation. Written as
    # "[-+]?\d*\.\d+|\d+" it binds only to the decimal branch, so "-5" would
    # be read as 5.0 while "-5.0" would be read as -5.0.
    number_pattern = r"[-+]?(?:\d*\.\d+|\d+)"
    return [float(token) for token in re.findall(number_pattern, docling_text)]
def compare_outputs(img1, img2):
    """Compare the numbers SmolDocling extracts from two images.

    Both images are converted to text with ``smoldocling_readimage``, the
    numbers in each text are pulled out with ``extract_numbers``, and the two
    number lists are compared position by position. Positions whose values
    differ are listed individually in the report.

    Args:
        img1: First uploaded image, or ``None`` when nothing was uploaded.
        img2: Second uploaded image, or ``None`` when nothing was uploaded.

    Returns:
        A text report with both raw outputs, the similarity percentage, the
        match count and the per-position mismatches, or a short prompt
        asking for both images when one of them is missing.
    """
    # An empty upload slot arrives as None; stop here with a readable message
    # instead of letting the processor fail on it.
    if img1 is None or img2 is None:
        return "Please upload both images before comparing."

    output1 = smoldocling_readimage(img1)
    output2 = smoldocling_readimage(img2)

    nums1 = extract_numbers(output1)
    nums2 = extract_numbers(output2)

    matches = 0
    mismatches = []
    # zip stops at the shorter list: surplus values in the longer list are
    # neither matches nor listed mismatches, but they still count in `total`.
    for position, (first, second) in enumerate(zip(nums1, nums2), start=1):
        if abs(first - second) < 1e-3:
            matches += 1
        else:
            mismatches.append(f"Pos {position}: {first} ≠ {second}")

    total = max(len(nums1), len(nums2))
    accuracy = (matches / total) * 100 if total > 0 else 0

    mismatch_text = "\n".join(mismatches) if mismatches else "✅ All values match."

    result_text = (
        f"📄 Output for Image 1:\n{output1}\n\n"
        f"📄 Output for Image 2:\n{output2}\n\n"
        f"📊 Similarity Accuracy: {accuracy:.2f}%\n"
        f"✅ Matching Values: {matches} / {total}\n"
        f"❌ Mismatches:\n{mismatch_text}"
    )
    return result_text
# Gradio UI: two image uploads go in, one plain-text comparison report comes out.
_image_inputs = [
    gr.Image(type="pil", label="Upload Image 1"),
    gr.Image(type="pil", label="Upload Image 2"),
]
demo = gr.Interface(
    fn=compare_outputs,
    inputs=_image_inputs,
    outputs="text",
    title="SmolDocling Image Comparison",
    description="Upload two document images to extract values and compare similarity, with detailed mismatches.",
)
demo.launch()
|