"""Gradio app that compares the numeric content of two document images.

Each image is converted to DocTags text with SmolDocling; every number found
in the two texts is then compared position by position and a plain-text
report is shown to the user.
"""

import re
from itertools import zip_longest

import gradio as gr
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

MODEL_ID = "ds4sd/SmolDocling-256M-preview"

# Two numbers are considered equal when they differ by less than this.
TOLERANCE = 1e-3

# NOTE(review): the original call was `result.replace("", "")`, which is a
# no-op. The literal was presumably this end-of-turn token, lost when the
# source was extracted (it looks like an HTML tag) -- confirm against the
# model card.
END_OF_UTTERANCE = "<end_of_utterance>"

# The optional sign applies to both alternatives, so "-5" and "-5.0" both
# keep their sign (previously only the float form did).
_NUMBER_RE = re.compile(r"[-+]?(?:\d*\.\d+|\d+)")

# Load model & processor once at startup so every request reuses them.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)


def smoldocling_readimage(
    image: Image.Image,
    prompt_text: str = "Convert to docling",
    max_new_tokens: int = 1024,
) -> str:
    """Run SmolDocling on one image and return the generated text.

    Args:
        image: The document image to read.
        prompt_text: Instruction sent to the model alongside the image.
        max_new_tokens: Upper bound on generated tokens; output longer than
            this is cut off.

    Returns:
        The decoded generation (special tokens kept), with the end-of-turn
        marker removed and surrounding whitespace stripped.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt_text},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs.input_ids.shape[1]
    generated = outputs[:, prompt_length:]
    result = processor.batch_decode(generated, skip_special_tokens=False)[0]
    return result.replace(END_OF_UTTERANCE, "").strip()


def extract_numbers(docling_text: str) -> list[float]:
    """Return every integer or decimal number in *docling_text* as a float.

    Numbers are returned in order of appearance. Digits inside markup tags
    are matched too, because the search runs over the raw text.
    """
    return [float(token) for token in _NUMBER_RE.findall(docling_text)]


def _diff_numbers(
    nums1: list[float], nums2: list[float]
) -> tuple[int, list[str]]:
    """Compare two number lists by position.

    Returns the count of positions whose values differ by less than
    ``TOLERANCE`` and a list of human-readable lines for every other
    position, including positions that exist in only one list.
    """
    matches = 0
    mismatches = []
    for pos, (a, b) in enumerate(zip_longest(nums1, nums2), start=1):
        if a is None or b is None:
            # One list is shorter: report the unpaired value instead of
            # silently lowering the accuracy.
            left = "(missing)" if a is None else a
            right = "(missing)" if b is None else b
            mismatches.append(f"Pos {pos}: {left} ≠ {right}")
        elif abs(a - b) < TOLERANCE:
            matches += 1
        else:
            mismatches.append(f"Pos {pos}: {a} ≠ {b}")
    return matches, mismatches


def compare_outputs(img1, img2):
    """Build a similarity report for two document images.

    Args:
        img1: First image as delivered by the Gradio image input, or None
            when nothing was uploaded.
        img2: Second image, same conventions as *img1*.

    Returns:
        A text report with both model outputs, the percentage of matching
        numeric positions, the match count, and the list of mismatches.
        When either image is missing, a short prompt to upload both.
    """
    if img1 is None or img2 is None:
        return "Please upload both images before comparing."

    output1 = smoldocling_readimage(img1)
    output2 = smoldocling_readimage(img2)

    nums1 = extract_numbers(output1)
    nums2 = extract_numbers(output2)

    matches, mismatches = _diff_numbers(nums1, nums2)

    # Accuracy is relative to the longer list, so unpaired values count
    # against the score.
    total = max(len(nums1), len(nums2))
    accuracy = (matches / total) * 100 if total > 0 else 0

    mismatch_text = "\n".join(mismatches) if mismatches else "✅ All values match."

    return (
        f"📄 Output for Image 1:\n{output1}\n\n"
        f"📄 Output for Image 2:\n{output2}\n\n"
        f"🔍 Similarity Accuracy: {accuracy:.2f}%\n"
        f"✅ Matching Values: {matches} / {total}\n"
        f"❌ Mismatches:\n{mismatch_text}"
    )


# Gradio UI: take 2 images, output similarity report.
demo = gr.Interface(
    fn=compare_outputs,
    inputs=[
        gr.Image(type="pil", label="Upload Image 1"),
        gr.Image(type="pil", label="Upload Image 2"),
    ],
    outputs="text",
    title="SmolDocling Image Comparison",
    description=(
        "Upload two document images to extract values and compare "
        "similarity, with detailed mismatches."
    ),
)

if __name__ == "__main__":
    demo.launch()