Spaces:
Sleeping
Sleeping
File size: 5,106 Bytes
1b7aff0 dbef28c 99c8757 dbef28c 6c102e5 dbef28c 8dc569d dbef28c 8dc569d dbef28c 8dc569d dbef28c 8dc569d dbef28c 8dc569d dbef28c 8dc569d dbef28c 8dc569d dbef28c 99c8757 dbef28c b85af28 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
# import re
# import gradio as gr
# from transformers import AutoProcessor, AutoModelForImageTextToText
# from PIL import Image
# # Load model & processor once at startup
# processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
# model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
# def smoldocling_readimage(image, prompt_text="Convert to docling"):
# messages = [
# {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
# ]
# prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
# inputs = processor(text=prompt, images=[image], return_tensors="pt")
# outputs = model.generate(**inputs, max_new_tokens=1024)
# prompt_length = inputs.input_ids.shape[1]
# generated = outputs[:, prompt_length:]
# result = processor.batch_decode(generated, skip_special_tokens=False)[0]
# return result.replace("<end_of_utterance>", "").strip()
# def extract_numbers(docling_text):
# # Extract all floating numbers from the docling text using regex
# numbers = re.findall(r"[-+]?\d*\.\d+|\d+", docling_text)
# return list(map(float, numbers))
# def compare_outputs(img1, img2):
# # Extract docling text from both images
# output1 = smoldocling_readimage(img1)
# output2 = smoldocling_readimage(img2)
# # Extract numbers from both outputs
# nums1 = extract_numbers(output1)
# nums2 = extract_numbers(output2)
# # Compare numbers - find matching count based on position
# length = min(len(nums1), len(nums2))
# matches = sum(1 for i in range(length) if abs(nums1[i] - nums2[i]) < 1e-3)
# # Calculate similarity accuracy percentage
# total = max(len(nums1), len(nums2))
# accuracy = (matches / total) * 100 if total > 0 else 0
# # Prepare result text
# result_text = (
# f"Output for Image 1:\n{output1}\n\n"
# f"Output for Image 2:\n{output2}\n\n"
# f"Similarity Accuracy: {accuracy:.2f}%\n"
# f"Matching Values: {matches} out of {total}"
# )
# return result_text
# # Gradio UI: take 2 images, output similarity report
# demo = gr.Interface(
# fn=compare_outputs,
# inputs=[
# gr.Image(type="pil", label="Upload Image 1"),
# gr.Image(type="pil", label="Upload Image 2"),
# ],
# outputs="text",
# title="SmolDocling Image Comparison",
# description="Upload two document images. This app extracts data from both and compares similarity."
# )
# demo.launch()
import re
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
# Load the processor and the model a single time, when the module is imported.
# Both come from the same Hub checkpoint, so the id is spelled once.
_CHECKPOINT = "ds4sd/SmolDocling-256M-preview"
processor = AutoProcessor.from_pretrained(_CHECKPOINT)
model = AutoModelForImageTextToText.from_pretrained(_CHECKPOINT)
def smoldocling_readimage(image, prompt_text="Convert to docling", max_new_tokens=1024):
    """Run the module-level model on one image and return the decoded text.

    Args:
        image: Image handed to the processor (the UI in this file supplies
            PIL images).
        prompt_text: Instruction placed after the image in the user message.
        max_new_tokens: Upper bound on generated tokens. Output that would be
            longer is cut off at this length, so raise it for dense pages.

    Returns:
        The decoded generation with special tokens kept, the literal
        "<end_of_utterance>" marker removed, and surrounding whitespace
        stripped.
    """
    messages = [
        {
            "role": "user",
            "content": [{"type": "image"}, {"type": "text", "text": prompt_text}],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # Drop the first `prompt_length` positions so only the tokens that follow
    # the prompt are decoded.
    prompt_length = inputs.input_ids.shape[1]
    generated = outputs[:, prompt_length:]

    # Special tokens are kept on purpose; only the end marker is removed.
    result = processor.batch_decode(generated, skip_special_tokens=False)[0]
    return result.replace("<end_of_utterance>", "").strip()
# The optional sign is grouped with BOTH alternatives. Without the group the
# sign attaches only to decimals, so "-5" would be read as 5.0 while "-5.0"
# would be read as -5.0.
_NUMBER_PATTERN = re.compile(r"[-+]?(?:\d*\.\d+|\d+)")


def extract_numbers(docling_text):
    """Return every number found in *docling_text* as a list of floats.

    A number is an optional sign followed by either a decimal (digits are
    optional before the point, required after it) or a plain run of digits.
    Matching is purely textual: digits are picked up wherever they occur in
    the string, in order of appearance.

    Args:
        docling_text: Text to scan.

    Returns:
        The matched numbers converted to ``float``; an empty list when the
        text contains no digits.
    """
    return [float(token) for token in _NUMBER_PATTERN.findall(docling_text)]
# Status glyphs are written as \N{...} escapes so this source stays pure ASCII
# and a re-encoding of the file cannot corrupt them or split a string literal.
# NOTE(review): the reviewed copy had these glyphs mis-decoded; the code points
# below are the most likely originals - confirm against the intended UI text.
_CHECK_MARK = "\N{WHITE HEAVY CHECK MARK}"
_CROSS_MARK = "\N{CROSS MARK}"
_NOT_EQUAL = "\N{NOT EQUAL TO}"
_PAGE = "\N{PAGE FACING UP}"
_CHART = "\N{BAR CHART}"


def _diff_numbers(nums1, nums2, tolerance=1e-3):
    """Compare two number lists by position; return (matches, total, mismatches)."""
    matches = 0
    mismatches = []
    for pos, (left, right) in enumerate(zip(nums1, nums2), start=1):
        if abs(left - right) < tolerance:
            matches += 1
        else:
            mismatches.append(f"Pos {pos}: {left} {_NOT_EQUAL} {right}")

    # Values with no counterpart already lower the accuracy (they count toward
    # `total`), so they are listed too; otherwise the report could claim that
    # everything matches while showing an accuracy below 100%.
    paired = min(len(nums1), len(nums2))
    for pos, extra in enumerate(nums1[paired:], start=paired + 1):
        mismatches.append(f"Pos {pos}: {extra} {_NOT_EQUAL} (missing)")
    for pos, extra in enumerate(nums2[paired:], start=paired + 1):
        mismatches.append(f"Pos {pos}: (missing) {_NOT_EQUAL} {extra}")

    total = max(len(nums1), len(nums2))
    return matches, total, mismatches


def compare_outputs(img1, img2):
    """Read both images and report how closely their numbers agree.

    Each image is passed through ``smoldocling_readimage``; the numbers in
    each result are pulled out with ``extract_numbers`` and compared position
    by position. Two numbers match when they differ by less than 1e-3.

    Args:
        img1: First image, or ``None`` when nothing was supplied.
        img2: Second image, or ``None`` when nothing was supplied.

    Returns:
        A text report containing both raw outputs, the similarity percentage
        (matches divided by the longer list's length), the match count and
        one line per mismatch. If either image is ``None`` a short prompt to
        supply both images is returned instead and no model call is made.
    """
    # An empty image slot arrives as None; answer with a message rather than
    # failing inside the model call.
    if img1 is None or img2 is None:
        return "Please upload both images before comparing."

    output1 = smoldocling_readimage(img1)
    output2 = smoldocling_readimage(img2)

    nums1 = extract_numbers(output1)
    nums2 = extract_numbers(output2)

    matches, total, mismatches = _diff_numbers(nums1, nums2)
    # Guard the division: neither output may contain any number at all.
    accuracy = (matches / total) * 100 if total > 0 else 0

    if mismatches:
        mismatch_text = "\n".join(mismatches)
    else:
        mismatch_text = f"{_CHECK_MARK} All values match."

    result_text = (
        f"{_PAGE} Output for Image 1:\n{output1}\n\n"
        f"{_PAGE} Output for Image 2:\n{output2}\n\n"
        f"{_CHART} Similarity Accuracy: {accuracy:.2f}%\n"
        f"{_CHECK_MARK} Matching Values: {matches} / {total}\n"
        f"{_CROSS_MARK} Mismatches:\n{mismatch_text}"
    )
    return result_text
# Gradio UI: two PIL image slots in, one plain-text report out.
image_inputs = [
    gr.Image(type="pil", label=f"Upload Image {slot}") for slot in (1, 2)
]
demo = gr.Interface(
    fn=compare_outputs,
    inputs=image_inputs,
    outputs="text",
    title="SmolDocling Image Comparison",
    description="Upload two document images to extract values and compare similarity, with detailed mismatches.",
)
# Start serving as soon as this module is executed.
demo.launch()
|