"""Gradio app that compares the numeric content of two document images.

Each uploaded image is converted to text with the SmolDocling model, the
numbers found in both outputs are extracted, and the two number sequences are
compared position by position.
"""

import re
from itertools import zip_longest

import gradio as gr
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

MODEL_ID = "ds4sd/SmolDocling-256M-preview"

# NOTE(review): the original code called ``result.replace("", "")``, which is a
# no-op; the token to strip was presumably lost as an angle-bracket tag.
# ``<end_of_utterance>`` is assumed to be the model's end-of-sequence marker,
# which is left in the text because decoding uses skip_special_tokens=False.
# Confirm against the model's tokenizer configuration.
END_OF_UTTERANCE = "<end_of_utterance>"

# Two numbers are considered equal when they differ by less than this amount.
TOLERANCE = 1e-3

# An optional sign followed by either a decimal ("1.5", ".5") or an integer.
# The alternation is grouped so the sign applies to integers as well.
_NUMBER_RE = re.compile(r"[-+]?(?:\d*\.\d+|\d+)")

# Load model & processor once at startup so every request reuses them.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)


def smoldocling_readimage(image: Image.Image, prompt_text: str = "Convert to docling") -> str:
    """Run the model on one image and return the decoded generated text.

    Args:
        image: The image to read (the UI supplies PIL images).
        prompt_text: Instruction sent to the model together with the image.

    Returns:
        The generated text with the end-of-utterance marker removed and
        surrounding whitespace stripped.
    """
    messages = [
        {
            "role": "user",
            "content": [{"type": "image"}, {"type": "text", "text": prompt_text}],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=1024)

    # Drop the first ``prompt_length`` token positions so that only the tokens
    # following the prompt are decoded.
    prompt_length = inputs.input_ids.shape[1]
    generated = outputs[:, prompt_length:]
    result = processor.batch_decode(generated, skip_special_tokens=False)[0]
    return result.replace(END_OF_UTTERANCE, "").strip()


def extract_numbers(docling_text: str) -> list[float]:
    """Return every number found in *docling_text*, in order, as floats.

    Both integers and decimals are recognised, each with an optional leading
    ``+`` or ``-`` sign.
    """
    return [float(token) for token in _NUMBER_RE.findall(docling_text)]


def _compare_numbers(nums1: list[float], nums2: list[float]) -> tuple[int, int, list[str]]:
    """Compare two number sequences position by position.

    Returns:
        ``(matches, total, mismatches)`` where ``total`` is the length of the
        longer sequence and ``mismatches`` holds one description per position
        that differs, including positions present in only one sequence.
    """
    matches = 0
    mismatches = []
    for pos, (left, right) in enumerate(zip_longest(nums1, nums2), start=1):
        if left is None or right is None:
            # One sequence is shorter: report the unmatched value instead of
            # silently lowering the accuracy.
            shown_left = "(missing)" if left is None else left
            shown_right = "(missing)" if right is None else right
            mismatches.append(f"Pos {pos}: {shown_left} ≠ {shown_right}")
        elif abs(left - right) < TOLERANCE:
            matches += 1
        else:
            mismatches.append(f"Pos {pos}: {left} ≠ {right}")
    total = max(len(nums1), len(nums2))
    return matches, total, mismatches


def compare_outputs(img1, img2) -> str:
    """Build a text report comparing the numbers read from two images.

    Args:
        img1: First uploaded image, or ``None`` when nothing was uploaded.
        img2: Second uploaded image, or ``None`` when nothing was uploaded.

    Returns:
        A report containing both model outputs, the similarity accuracy, the
        match count and the list of mismatching positions; or a short message
        asking for both images when one is missing.
    """
    if img1 is None or img2 is None:
        return "⚠️ Please upload both images before comparing."

    output1 = smoldocling_readimage(img1)
    output2 = smoldocling_readimage(img2)

    matches, total, mismatches = _compare_numbers(
        extract_numbers(output1), extract_numbers(output2)
    )
    accuracy = (matches / total) * 100 if total > 0 else 0

    if mismatches:
        mismatch_text = "\n".join(mismatches)
    elif total == 0:
        # Avoid claiming "all values match" next to a 0% accuracy.
        mismatch_text = "No numeric values found in either image."
    else:
        mismatch_text = "✅ All values match."

    return (
        f"📄 Output for Image 1:\n{output1}\n\n"
        f"📄 Output for Image 2:\n{output2}\n\n"
        f"🔍 Similarity Accuracy: {accuracy:.2f}%\n"
        f"✅ Matching Values: {matches} / {total}\n"
        f"❌ Mismatches:\n{mismatch_text}"
    )


# Gradio UI: two image inputs, one text report.
demo = gr.Interface(
    fn=compare_outputs,
    inputs=[
        gr.Image(type="pil", label="Upload Image 1"),
        gr.Image(type="pil", label="Upload Image 2"),
    ],
    outputs="text",
    title="SmolDocling Image Comparison",
    description="Upload two document images to extract values and compare similarity, with detailed mismatches.",
)

demo.launch()