import re
import numpy as np
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image

# Load model & processor once at startup
processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
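
# Optional (a sketch, not enabled by default): move the model to a GPU when one
# is available; transformers already pulls in torch, so the import is available.
# import torch
# if torch.cuda.is_available():
#     model = model.to("cuda")
#     # the `inputs` built in get_array_from_image would then also need .to("cuda")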

def extract_values(docling_text):
    """Parse SmolDocling OTSL output into a 2D list of numeric cell values."""
    # Remove all <loc_*> location tags
    cleaned = re.sub(r"<loc_\d+>", "", docling_text)
    # Split rows by <nl>
    rows = cleaned.split("<nl>")
    result = []
    for row in rows:
        if not row.strip():
            continue
        # Capture the text following each <fcel> tag, up to the next tag
        values = re.findall(r"<fcel>([^<]*)", row)
        float_values = []
        for v in values:
            try:
                float_values.append(float(v.strip()))
            except ValueError:
                continue  # skip non-numeric cells such as header text
        if float_values:
            result.append(float_values)
    return result
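
# Worked example (the tag string is an assumed sample of SmolDocling's OTSL
# output, not captured model output):
#   extract_values("<fcel>1.5<fcel>2<nl><fcel>3<fcel>4<nl>")
#   -> [[1.5, 2.0], [3.0, 4.0]]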

def get_array_from_image(image, prompt_text):
    """Run SmolDocling on one image and return its table values as a 2D list."""
    messages = [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=1024)
    # Decode only the newly generated tokens, not the echoed prompt
    prompt_length = inputs.input_ids.shape[1]
    generated = outputs[:, prompt_length:]
    raw_result = processor.batch_decode(generated, skip_special_tokens=False)[0]
    return extract_values(raw_result)
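
# Example call (the file name is hypothetical, for illustration only):
#   arr = get_array_from_image(Image.open("table_page.png"), "Convert this page to docling.")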

def compare_arrays(arr1, arr2):
    # Flatten both 2D lists with a comprehension; rows may have unequal
    # lengths, which np.array(...).flatten() does not handle
    flat1 = np.array([v for row in arr1 for v in row], dtype=float)
    flat2 = np.array([v for row in arr2 for v in row], dtype=float)

    # If lengths differ, compare only the overlapping prefix
    min_len = min(len(flat1), len(flat2))
    if min_len == 0:
        return 0.0  # no data to compare

    flat1 = flat1[:min_len]
    flat2 = flat2[:min_len]

    # Similarity = 1 - normalized mean absolute error
    mae = np.mean(np.abs(flat1 - flat2))
    max_val = max(np.max(np.abs(flat1)), np.max(np.abs(flat2)), 1e-6)  # avoid division by zero
    similarity = 1 - (mae / max_val)
    similarity_percent = max(0.0, similarity) * 100  # clamp at 0 for very dissimilar inputs

    return round(similarity_percent, 2)
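
# Worked example: for [[1, 2], [3, 4]] vs [[1, 2], [3, 5]], the flattened
# arrays differ only in the last cell, so mae = 0.25, max_val = 5, and the
# similarity is (1 - 0.25 / 5) * 100 = 95.0.
#   compare_arrays([[1, 2], [3, 4]], [[1, 2], [3, 5]])  # -> 95.0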

def process_two_images(image1, image2, prompt_text):
    arr1 = get_array_from_image(image1, prompt_text)
    arr2 = get_array_from_image(image2, prompt_text)
    similarity = compare_arrays(arr1, arr2)
    
    return (
        f"Extracted values from Image 1:\n{arr1}\n\n"
        f"Extracted values from Image 2:\n{arr2}\n\n"
        f"Similarity Accuracy: {similarity} %"
    )

demo = gr.Interface(
    fn=process_two_images,
    inputs=[
        gr.Image(type="pil", label="Upload Image 1"),
        gr.Image(type="pil", label="Upload Image 2"),
        gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Convert this page to docling.)", label="Prompt"),
    ],
    outputs="text",
    title="SmolDocling Image Comparison",
    description="Upload two document images, extract numeric arrays, and compare their similarity."
)

demo.launch()