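"""Gradio demo: run SmolDocling on two table images, pull the numeric cell
values out of its DocTags/OTSL output, and report how similar the two
extracted arrays are."""

# The tag handling below assumes SmolDocling's DocTags/OTSL markup. An
# illustrative (hand-written, not model-generated) output for a 2x2 table:
#   <otsl><loc_12><loc_34><loc_456><loc_478><fcel>12.5<fcel>3.4<nl><fcel>7.0<fcel>9.9<nl></otsl><end_of_utterance>
# extract_values() below turns this into [[12.5, 3.4], [7.0, 9.9]].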
import re

import gradio as gr
import numpy as np
from transformers import AutoProcessor, AutoModelForImageTextToText

# Load model & processor once at startup
processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")


def extract_values(docling_text):
    """Parse DocTags/OTSL output into a 2D list of floats (one list per table row)."""
    # Strip location tokens such as <loc_123>, keeping the structural tags
    cleaned = re.sub(r"<loc_\d+>", "", docling_text)
    # Split rows on the <nl> (new-line) tag
    rows = cleaned.split("<nl>")
    result = []
    for row in rows:
        if not row.strip():
            continue
        # Extract the numeric value after each <fcel> (full cell) tag
        values = re.findall(r"<fcel>\s*([\d.]+)", row)
        float_values = [float(v) for v in values]
        if float_values:  # skip chunks with no cells (e.g. a trailing </otsl>)
            result.append(float_values)
    return result


def get_array_from_image(image, prompt_text):
    """Run SmolDocling on one image and return the extracted numeric rows."""
    messages = [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=1024)
    # Drop the echoed prompt; keep special tokens so the tags survive decoding
    prompt_length = inputs.input_ids.shape[1]
    generated = outputs[:, prompt_length:]
    raw_result = processor.batch_decode(generated, skip_special_tokens=False)[0]
    return extract_values(raw_result)


def compare_arrays(arr1, arr2):
    # Flatten row by row; np.array(...).flatten() would fail on ragged rows
    flat1 = np.array([v for row in arr1 for v in row])
    flat2 = np.array([v for row in arr2 for v in row])
    # If lengths differ, compare only the overlapping prefix
    min_len = min(len(flat1), len(flat2))
    if min_len == 0:
        return 0.0  # no data to compare
    flat1 = flat1[:min_len]
    flat2 = flat2[:min_len]
    # Similarity = 1 - normalized mean absolute error
    mae = np.mean(np.abs(flat1 - flat2))
    max_val = max(np.max(flat1), np.max(flat2), 1e-6)  # avoid division by zero
    similarity = 1 - (mae / max_val)
    similarity_percent = max(0, similarity) * 100  # clamp to >= 0
    return round(similarity_percent, 2)


def process_two_images(image1, image2, prompt_text):
    arr1 = get_array_from_image(image1, prompt_text)
    arr2 = get_array_from_image(image2, prompt_text)
    similarity = compare_arrays(arr1, arr2)
    return (
        f"Extracted values from Image 1:\n{arr1}\n\n"
        f"Extracted values from Image 2:\n{arr2}\n\n"
        f"Similarity Accuracy: {similarity}%"
    )


demo = gr.Interface(
    fn=process_two_images,
    inputs=[
        gr.Image(type="pil", label="Upload Image 1"),
        gr.Image(type="pil", label="Upload Image 2"),
        gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Convert to docling)", label="Prompt"),
    ],
    outputs="text",
    title="SmolDocling Image Comparison",
    description="Upload two document images, extract their numeric values, and compare them for similarity.",
)

demo.launch()
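# Worked example of the similarity metric (inputs made up for illustration):
#   arr1 = [[10.0, 20.0]], arr2 = [[10.0, 22.0]]
#   mae     = mean(|10 - 10|, |20 - 22|) = 1.0
#   max_val = max(20.0, 22.0, 1e-6)      = 22.0
#   score   = (1 - 1.0 / 22.0) * 100     ≈ 95.45 %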