Docling_Image / app.py
Pavan147's picture
Update app.py
dbef28c verified
# import re
# import gradio as gr
# from transformers import AutoProcessor, AutoModelForImageTextToText
# from PIL import Image
# # Load model & processor once at startup
# processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
# model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
# def smoldocling_readimage(image, prompt_text="Convert to docling"):
# messages = [
# {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
# ]
# prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
# inputs = processor(text=prompt, images=[image], return_tensors="pt")
# outputs = model.generate(**inputs, max_new_tokens=1024)
# prompt_length = inputs.input_ids.shape[1]
# generated = outputs[:, prompt_length:]
# result = processor.batch_decode(generated, skip_special_tokens=False)[0]
# return result.replace("<end_of_utterance>", "").strip()
# def extract_numbers(docling_text):
# # Extract all floating numbers from the docling text using regex
# numbers = re.findall(r"[-+]?\d*\.\d+|\d+", docling_text)
# return list(map(float, numbers))
# def compare_outputs(img1, img2):
# # Extract docling text from both images
# output1 = smoldocling_readimage(img1)
# output2 = smoldocling_readimage(img2)
# # Extract numbers from both outputs
# nums1 = extract_numbers(output1)
# nums2 = extract_numbers(output2)
# # Compare numbers — find matching count based on position
# length = min(len(nums1), len(nums2))
# matches = sum(1 for i in range(length) if abs(nums1[i] - nums2[i]) < 1e-3)
# # Calculate similarity accuracy percentage
# total = max(len(nums1), len(nums2))
# accuracy = (matches / total) * 100 if total > 0 else 0
# # Prepare result text
# result_text = (
# f"Output for Image 1:\n{output1}\n\n"
# f"Output for Image 2:\n{output2}\n\n"
# f"Similarity Accuracy: {accuracy:.2f}%\n"
# f"Matching Values: {matches} out of {total}"
# )
# return result_text
# # Gradio UI: take 2 images, output similarity report
# demo = gr.Interface(
# fn=compare_outputs,
# inputs=[
# gr.Image(type="pil", label="Upload Image 1"),
# gr.Image(type="pil", label="Upload Image 2"),
# ],
# outputs="text",
# title="SmolDocling Image Comparison",
# description="Upload two document images. This app extracts data from both and compares similarity."
# )
# demo.launch()
import re
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
# Load the processor and the model a single time at import; both come from
# the same checkpoint id, kept in one place so the two cannot drift apart.
MODEL_ID = "ds4sd/SmolDocling-256M-preview"
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)
def smoldocling_readimage(image, prompt_text="Convert to docling"):
    """Generate text for one image with the module-level model.

    Args:
        image: The image handed to the processor (the UI supplies a PIL image).
        prompt_text: Instruction placed after the image in the user message.

    Returns:
        The decoded continuation with every "<end_of_utterance>" marker
        removed and surrounding whitespace stripped. Other special tokens
        are kept, because decoding runs with ``skip_special_tokens=False``.
    """
    user_turn = {
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": prompt_text}],
    }
    chat_prompt = processor.apply_chat_template(
        [user_turn], add_generation_prompt=True
    )
    model_inputs = processor(text=chat_prompt, images=[image], return_tensors="pt")
    token_ids = model.generate(**model_inputs, max_new_tokens=1024)

    # Drop the leading positions that correspond to the prompt so only the
    # newly generated tokens are decoded.
    prompt_token_count = model_inputs.input_ids.shape[1]
    continuation = token_ids[:, prompt_token_count:]

    decoded = processor.batch_decode(continuation, skip_special_tokens=False)
    return decoded[0].replace("<end_of_utterance>", "").strip()
# The alternation is grouped so the optional sign applies to integers as well
# as decimals; ungrouped, "-5" would be read as 5.
_NUMBER_RE = re.compile(r"[-+]?(?:\d*\.\d+|\d+)")


def extract_numbers(docling_text):
    """Return every number found in *docling_text*, in order, as floats.

    Recognised forms are optionally signed integers ("12", "-5") and
    decimals with digits after the point ("3.14", ".5", "+0.25").

    Args:
        docling_text: Text to scan.

    Returns:
        A list of floats in order of appearance; empty when nothing matches.
    """
    # NOTE(review): digits embedded in markup tokens are extracted too —
    # confirm that is intended for the docling output being compared.
    return [float(token) for token in _NUMBER_RE.findall(docling_text)]
def compare_outputs(img1, img2):
    """Compare the numbers extracted from two document images.

    Each image is converted to text with ``smoldocling_readimage``; the
    numbers in each text are pulled out with ``extract_numbers`` and compared
    position by position. Two values match when they differ by less than
    1e-3. Accuracy is ``matches / max(len(nums1), len(nums2)) * 100``, so a
    value with no counterpart in the other image counts against the score.

    Args:
        img1: First image.
        img2: Second image.

    Returns:
        A text report containing both raw outputs, the accuracy percentage,
        the match count, and one line per mismatching or unpaired position.
        If either image is ``None``, a short prompt to upload both images is
        returned instead.
    """
    # NOTE(review): an empty Gradio image slot is expected to arrive as None;
    # answer with a readable message rather than failing inside the processor.
    if img1 is None or img2 is None:
        return "Please upload both images before comparing."

    output1 = smoldocling_readimage(img1)
    output2 = smoldocling_readimage(img2)

    nums1 = extract_numbers(output1)
    nums2 = extract_numbers(output2)

    matches = 0
    mismatches = []
    for pos, (left, right) in enumerate(zip(nums1, nums2), start=1):
        if abs(left - right) < 1e-3:
            matches += 1
        else:
            mismatches.append(f"Pos {pos}: {left} ≠ {right}")

    # Values past the end of the shorter list have nothing to pair with.
    # They already lower the accuracy, so list them explicitly; otherwise the
    # report could claim that all values match while showing less than 100%.
    paired = min(len(nums1), len(nums2))
    for pos, value in enumerate(nums1[paired:], start=paired + 1):
        mismatches.append(f"Pos {pos}: {value} ≠ (missing)")
    for pos, value in enumerate(nums2[paired:], start=paired + 1):
        mismatches.append(f"Pos {pos}: (missing) ≠ {value}")

    total = max(len(nums1), len(nums2))
    accuracy = (matches / total) * 100 if total > 0 else 0

    mismatch_text = "\n".join(mismatches) if mismatches else "✅ All values match."
    return (
        f"📄 Output for Image 1:\n{output1}\n\n"
        f"📄 Output for Image 2:\n{output2}\n\n"
        f"🔍 Similarity Accuracy: {accuracy:.2f}%\n"
        f"✅ Matching Values: {matches} / {total}\n"
        f"❌ Mismatches:\n{mismatch_text}"
    )
# Gradio UI: two image uploads in, one plain-text comparison report out.
demo = gr.Interface(
    fn=compare_outputs,
    inputs=[
        gr.Image(type="pil", label="Upload Image 1"),
        gr.Image(type="pil", label="Upload Image 2"),
    ],
    outputs="text",
    title="SmolDocling Image Comparison",
    description="Upload two document images to extract values and compare similarity, with detailed mismatches.",
)

# Start the web server only when this file is executed as a script, so that
# importing the module (e.g. to reuse the functions) does not launch the app.
if __name__ == "__main__":
    demo.launch()