"""Gradio app: compare the numeric table cells SmolDocling reads from two images."""

import re

import gradio as gr
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

MODEL_ID = "ds4sd/SmolDocling-256M-preview"

# Load SmolDocling model & processor once, at import time, so every request
# served by the UI reuses the same instances.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)

# NOTE(review): the original code called ``result.replace("", "")``, which is a
# no-op; the tag literal appears to have been stripped from the source.
# ``<end_of_utterance>`` is presumably the end-of-turn marker that remains in
# the text when decoding with ``skip_special_tokens=False`` -- confirm against
# the model's tokenizer.
_END_OF_UTTERANCE = "<end_of_utterance>"

# A number that directly follows an OTSL ``<fcel>`` (filled cell) tag.
# Anchoring on the tag keeps digits inside other tags out of the result, and
# the explicit number grammar guarantees ``float()`` never sees "." or "1.2.3".
# NOTE(review): the original pattern was ``([\d.]+)`` with no tag; the function
# name and docstring indicate ``<fcel>`` was intended -- confirm.
_FCEL_NUMBER_RE = re.compile(r"<fcel>\s*([-+]?(?:\d+(?:\.\d*)?|\.\d+))")


def _parse_fcel_values(text: str) -> list:
    """Return the leading number of every ``<fcel>`` cell in *text* as floats."""
    return [float(number) for number in _FCEL_NUMBER_RE.findall(text)]


def _positional_accuracy(values1: list, values2: list) -> float:
    """Return the percentage of positions at which both lists hold equal values.

    The denominator is the longer list's length, so missing or extra values
    count as mismatches. When neither list has any value the result is 0.0:
    nothing was extracted, so there is no agreement to report.
    """
    total = max(len(values1), len(values2))
    if total == 0:
        return 0.0
    matches = sum(1 for a, b in zip(values1, values2) if a == b)
    return matches / total * 100


def extract_fcel_values_from_image(image: Image.Image, prompt_text: str) -> tuple:
    """Run SmolDocling on an image and return numeric values inside <fcel> tags.

    Args:
        image: Image passed to the processor as the single image of the turn.
        prompt_text: Instruction text sent to the model alongside the image.

    Returns:
        A ``(values, clean_text)`` tuple: ``values`` is the list of floats
        parsed from ``<fcel>`` cells, ``clean_text`` is the decoded model
        output with the end-of-utterance marker and surrounding whitespace
        removed.
    """
    # Prepare prompt for the model: one user turn with an image slot and text.
    messages = [
        {
            "role": "user",
            "content": [{"type": "image"}, {"type": "text", "text": prompt_text}],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")

    # Generate output, then drop the prompt tokens so only new tokens are decoded.
    outputs = model.generate(**inputs, max_new_tokens=2048)
    prompt_length = inputs.input_ids.shape[1]
    generated = outputs[:, prompt_length:]

    # Special tokens are kept on purpose: the table tags are what gets parsed.
    result = processor.batch_decode(generated, skip_special_tokens=False)[0]
    clean_text = result.replace(_END_OF_UTTERANCE, "").strip()

    return _parse_fcel_values(clean_text), clean_text


def compare_images(image1, image2, prompt_text):
    """Extract cell values from two table images and score their agreement.

    Args:
        image1: First table image (``None`` when nothing was uploaded).
        image2: Second table image (``None`` when nothing was uploaded).
        prompt_text: Prompt forwarded to the model for both images.

    Returns:
        A dict with the keys ``"Extracted Values 1"``, ``"Extracted Values 2"``
        and ``"Accuracy (%)"``.

    Raises:
        gr.Error: If either image is missing, so the UI shows a readable
            message instead of a processor traceback.
    """
    if image1 is None or image2 is None:
        raise gr.Error("Please upload both table images before comparing.")

    # Only the parsed values are compared; the raw decoded text is not needed.
    values1, _ = extract_fcel_values_from_image(image1, prompt_text)
    values2, _ = extract_fcel_values_from_image(image2, prompt_text)

    return {
        "Extracted Values 1": values1,
        "Extracted Values 2": values2,
        "Accuracy (%)": _positional_accuracy(values1, values2),
    }


# Gradio UI
demo = gr.Interface(
    fn=compare_images,
    inputs=[
        gr.Image(type="pil", label="Upload First Table Image"),
        gr.Image(type="pil", label="Upload Second Table Image"),
        gr.Textbox(
            lines=1,
            placeholder="Enter prompt (e.g. Extract table as OTSL)",
            label="Prompt",
        ),
    ],
    outputs="json",
    title="Table Data Accuracy Checker (SmolDocling)",
    description="Uploads two table images, extracts only values from OTSL output, and compares them for accuracy.",
)

# Start the server only when run as a script, so importing this module
# (e.g. for reuse of ``compare_images``) does not block on a web server.
if __name__ == "__main__":
    demo.launch()