import re
import numpy as np
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText

# Load SmolDocling model & processor once at startup
processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
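# Optional (untested sketch): on a CUDA machine the model could be moved to the
# GPU for faster inference; the processor inputs would then need .to(device) too.
#   import torch
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model.to(device)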
def extract_values(docling_text):
    """Parse SmolDocling's OTSL-style output into a 2D list of floats."""
    # Remove location tags such as <loc_123>
    cleaned = re.sub(r"<loc_\d+>", "", docling_text)
    # Drop the end-of-utterance marker if present
    cleaned = cleaned.replace("<end_of_utterance>", "")
    # Rows are separated by <nl>
    rows = cleaned.split("<nl>")
    result = []
    for row in rows:
        if not row.strip():
            continue
        # Each cell is introduced by a single <fcel> tag; capture the text up to
        # the next tag rather than requiring a closing <fcel> (which would make
        # re.findall consume every second cell)
        cells = re.findall(r"<fcel>([^<]*)", row)
        # Keep only cells that parse as numbers, so header text doesn't crash float()
        float_values = []
        for cell in cells:
            try:
                float_values.append(float(cell.strip()))
            except ValueError:
                continue
        if float_values:
            result.append(float_values)
    return result
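# Illustrative example (hypothetical OTSL-style string, not a real model response):
#   extract_values("<fcel>1.0<fcel>2.5<nl><fcel>3<fcel>4<nl>")
#   -> [[1.0, 2.5], [3.0, 4.0]]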
def get_array_from_image(image, prompt_text):
    """Run SmolDocling on one image and return the extracted 2D value list."""
    messages = [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    # 1024 new tokens may truncate very large tables; raise max_new_tokens if needed
    outputs = model.generate(**inputs, max_new_tokens=1024)
    # Decode only the newly generated tokens, keeping special tags like <fcel>
    prompt_length = inputs.input_ids.shape[1]
    generated = outputs[:, prompt_length:]
    raw_result = processor.batch_decode(generated, skip_special_tokens=False)[0]
    return extract_values(raw_result)
def compare_arrays(arr1, arr2):
    # Flatten row by row; rows may have different lengths, so a plain
    # np.array(arr).flatten() would fail on ragged input
    flat1 = np.array([v for row in arr1 for v in row], dtype=float)
    flat2 = np.array([v for row in arr2 for v in row], dtype=float)
    # If lengths differ, compare only the overlapping prefix
    min_len = min(len(flat1), len(flat2))
    if min_len == 0:
        return 0.0  # no data to compare
    flat1 = flat1[:min_len]
    flat2 = flat2[:min_len]
    # Similarity = 1 - normalized mean absolute error
    mae = np.mean(np.abs(flat1 - flat2))
    # Normalize by the largest magnitude (abs handles all-negative values);
    # the 1e-6 floor avoids division by zero
    max_val = max(np.max(np.abs(flat1)), np.max(np.abs(flat2)), 1e-6)
    similarity = 1 - (mae / max_val)
    similarity_percent = max(0.0, similarity) * 100  # clamp to >= 0
    return round(similarity_percent, 2)
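# Worked example: for [[1, 2, 3]] vs [[1, 2, 4]] the absolute differences are
# [0, 0, 1], so mae = 1/3 ~= 0.333, max_val = 4, and the similarity is
# (1 - 0.333 / 4) * 100 ~= 91.67 %.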
def process_two_images(image1, image2, prompt_text):
    # Gradio passes None for an empty image input; fail gracefully
    if image1 is None or image2 is None:
        return "Please upload both images before running the comparison."
    arr1 = get_array_from_image(image1, prompt_text)
    arr2 = get_array_from_image(image2, prompt_text)
    similarity = compare_arrays(arr1, arr2)
    return (
        f"Extracted values from Image 1:\n{arr1}\n\n"
        f"Extracted values from Image 2:\n{arr2}\n\n"
        f"Similarity Accuracy: {similarity} %"
    )
demo = gr.Interface(
fn=process_two_images,
inputs=[
gr.Image(type="pil", label="Upload Image 1"),
gr.Image(type="pil", label="Upload Image 2"),
gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Convert to docling)", label="Prompt"),
],
outputs="text",
title="SmolDocling Image Comparison",
description="Upload two document images, extract numeric arrays, and compare their similarity."
)
demo.launch()
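# When running outside Hugging Face Spaces, demo.launch(share=True) can be used
# to get a temporary public URL for the demo.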