Pavan147 committed on
Commit
0214886
·
verified ·
1 Parent(s): 62320a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -127
app.py CHANGED
@@ -1,147 +1,65 @@
1
- # import gradio as gr
2
- # from transformers import AutoProcessor, AutoModelForImageTextToText
3
- # from PIL import Image
4
- # import re
5
-
6
- # # Load SmolDocling model & processor once
7
- # processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
8
- # model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
9
-
10
- # def extract_fcel_values_from_image(image, prompt_text):
11
- # """Run SmolDocling on an image and return numeric values inside <fcel> tags."""
12
- # # Prepare prompt for the model
13
- # messages = [
14
- # {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
15
- # ]
16
- # prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
17
- # inputs = processor(text=prompt, images=[image], return_tensors="pt")
18
-
19
- # # Generate output
20
- # outputs = model.generate(**inputs, max_new_tokens=2048)
21
- # prompt_length = inputs.input_ids.shape[1]
22
- # generated = outputs[:, prompt_length:]
23
- # result = processor.batch_decode(generated, skip_special_tokens=False)[0]
24
- # clean_text = result.replace("<end_of_utterance>", "").strip()
25
-
26
- # # Extract only <fcel> values
27
- # values = re.findall(r"<fcel>([\d.]+)", clean_text)
28
- # values = [float(v) for v in values] # convert to floats
29
-
30
- # return values, clean_text
31
-
32
- # def compare_images(image1, image2, prompt_text):
33
- # # Extract fcel values from both images
34
- # values1, raw1 = extract_fcel_values_from_image(image1, prompt_text)
35
- # values2, raw2 = extract_fcel_values_from_image(image2, prompt_text)
36
-
37
- # # Calculate accuracy
38
- # if len(values1) == len(values2) and values1 == values2:
39
- # accuracy = 100.0
40
- # else:
41
- # matches = sum(1 for a, b in zip(values1, values2) if a == b)
42
- # total = max(len(values1), len(values2))
43
- # accuracy = (matches / total) * 100 if total > 0 else 0
44
-
45
- # return {
46
- # "Extracted Values 1": values1,
47
- # "Extracted Values 2": values2,
48
- # "Accuracy (%)": accuracy
49
- # }
50
-
51
- # # Gradio UI
52
- # demo = gr.Interface(
53
- # fn=compare_images,
54
- # inputs=[
55
- # gr.Image(type="pil", label="Upload First Table Image"),
56
- # gr.Image(type="pil", label="Upload Second Table Image"),
57
- # gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Extract table as OTSL)", label="Prompt")
58
- # ],
59
- # outputs="json",
60
- # title="Table Data Accuracy Checker (SmolDocling)",
61
- # description="Uploads two table images, extracts only <fcel> values from OTSL output, and compares them for accuracy."
62
- # )
63
-
64
- # demo.launch()
65
-
66
- import re
67
- import numpy as np
68
  import gradio as gr
69
  from transformers import AutoProcessor, AutoModelForImageTextToText
70
  from PIL import Image
 
71
 
72
- # Load model & processor once at startup
73
  processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
74
  model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
75
 
76
def extract_values(docling_text):
    """Parse OTSL-style model output into a 2-D list of floats.

    Rows are delimited by ``<nl>``; each cell value follows a ``<fcel>``
    tag. ``<loc_*>`` layout tags are stripped first. Cells that do not
    parse as numbers are skipped instead of raising.
    """
    # Remove all <loc_*> tags the model interleaves with the table markup.
    cleaned = re.sub(r"<loc_\d+>", "", docling_text)
    result = []
    # Split rows by <nl>
    for row in cleaned.split("<nl>"):
        if not row.strip():
            continue
        # Capture everything between a <fcel> tag and the next tag (or end
        # of row). The previous pattern r"<fcel>(.*?)<fcel>" consumed the
        # closing tag, so every cell after the first in a row was dropped.
        cells = re.findall(r"<fcel>([^<]*)", row)
        float_values = []
        for cell in cells:
            try:
                float_values.append(float(cell.strip()))
            except ValueError:
                # Non-numeric cell (e.g. header text) — skip rather than crash.
                continue
        result.append(float_values)
    return result
90
-
91
def get_array_from_image(image, prompt_text):
    """Run the model on one image and return the parsed 2-D value list."""
    # Chat-format the prompt, pairing the image with the user's text.
    chat = [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
    ]
    rendered_prompt = processor.apply_chat_template(chat, add_generation_prompt=True)
    model_inputs = processor(text=rendered_prompt, images=[image], return_tensors="pt")
    generation = model.generate(**model_inputs, max_new_tokens=1024)
    # Keep only the newly generated tokens, skipping the echoed prompt.
    new_tokens = generation[:, model_inputs.input_ids.shape[1]:]
    decoded = processor.batch_decode(new_tokens, skip_special_tokens=False)[0]
    return extract_values(decoded)
102
-
103
def compare_arrays(arr1, arr2):
    """Similarity score (0-100) between two 2-D lists of numbers.

    Flattens both inputs, truncates to the overlapping prefix, and scores
    them as 1 - normalized mean absolute error, clamped to >= 0 and
    rounded to two decimals. Returns 0.0 when either input is empty.
    """
    # Flatten with a comprehension: np.array(...).flatten() raises
    # ValueError on ragged rows, which extract_values can legitimately
    # produce when table rows have different cell counts.
    flat1 = np.array([v for row in arr1 for v in row], dtype=float)
    flat2 = np.array([v for row in arr2 for v in row], dtype=float)

    # If shapes differ, compare only overlapping parts.
    min_len = min(len(flat1), len(flat2))
    if min_len == 0:
        return 0.0  # no data to compare

    flat1 = flat1[:min_len]
    flat2 = flat2[:min_len]

    # Calculate similarity as 1 - normalized mean absolute error.
    mae = np.mean(np.abs(flat1 - flat2))
    max_val = max(np.max(flat1), np.max(flat2), 1e-6)  # avoid zero division
    similarity = 1 - (mae / max_val)
    similarity_percent = max(0, similarity) * 100  # clamp to >= 0

    return round(similarity_percent, 2)
123
-
124
def process_two_images(image1, image2, prompt_text):
    """Extract value arrays from both images and report their similarity."""
    first = get_array_from_image(image1, prompt_text)
    second = get_array_from_image(image2, prompt_text)
    score = compare_arrays(first, second)
    report = (
        f"Extracted values from Image 1:\n{first}\n\n"
        f"Extracted values from Image 2:\n{second}\n\n"
        f"Similarity Accuracy: {score} %"
    )
    return report
134
-
135
# Gradio app: two image uploads plus a free-text prompt, rendered as a
# single text report produced by process_two_images.
demo = gr.Interface(
    fn=process_two_images,
    inputs=[
        gr.Image(type="pil", label="Upload Image 1"),
        gr.Image(type="pil", label="Upload Image 2"),
        gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Convert to docling)", label="Prompt"),
    ],
    outputs="text",
    title="SmolDocling Image Comparison",
    description="Upload two document images, extract numeric arrays, and compare their similarity."
)

# Start the Gradio server (blocking call).
demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import re

# Load SmolDocling model & processor once at import time so every request
# reuses the same weights instead of reloading them per call.
processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
 
10
def extract_fcel_values_from_image(image, prompt_text):
    """Run SmolDocling on an image and return numeric values inside <fcel> tags."""
    # Build the chat-formatted prompt pairing the image with the user text.
    chat = [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
    ]
    rendered = processor.apply_chat_template(chat, add_generation_prompt=True)
    model_inputs = processor(text=rendered, images=[image], return_tensors="pt")

    # Generate, then strip the echoed prompt tokens from the output.
    generation = model.generate(**model_inputs, max_new_tokens=2048)
    new_tokens = generation[:, model_inputs.input_ids.shape[1]:]
    decoded = processor.batch_decode(new_tokens, skip_special_tokens=False)[0]
    clean_text = decoded.replace("<end_of_utterance>", "").strip()

    # Pull out the numeric payloads that follow <fcel> tags, as floats.
    values = [float(v) for v in re.findall(r"<fcel>([\d.]+)", clean_text)]
    return values, clean_text
31
+
32
def compare_images(image1, image2, prompt_text):
    """Extract <fcel> values from two table images and score their agreement.

    Returns a dict with key "Accuracy (%)": 100.0 on an exact match
    (including two empty extractions), otherwise the percentage of
    positions whose values agree, over the longer of the two lists.
    """
    # Extract fcel values from both images; the raw decoded text returned
    # alongside the values is not needed here.
    values1, _ = extract_fcel_values_from_image(image1, prompt_text)
    values2, _ = extract_fcel_values_from_image(image2, prompt_text)

    # Calculate accuracy. List equality already implies equal length,
    # so no separate len() check is needed.
    if values1 == values2:
        accuracy = 100.0
    else:
        matches = sum(1 for a, b in zip(values1, values2) if a == b)
        total = max(len(values1), len(values2))
        # total > 0 is guaranteed here (unequal lists cannot both be
        # empty), but keep the guard for safety.
        accuracy = (matches / total) * 100 if total > 0 else 0

    return {
        "Accuracy (%)": accuracy
    }
50
+
51
# Gradio UI: two table-image uploads plus a free-text prompt, rendered as a
# JSON accuracy report produced by compare_images.
demo = gr.Interface(
    fn=compare_images,
    inputs=[
        gr.Image(type="pil", label="Upload First Table Image"),
        gr.Image(type="pil", label="Upload Second Table Image"),
        gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Extract table as OTSL)", label="Prompt")
    ],
    outputs="json",
    title="Table Data Accuracy Checker (SmolDocling)",
    description="Uploads two table images, extracts only <fcel> values from OTSL output, and compares them for accuracy."
)

# Start the Gradio server (blocking call).
demo.launch()
65
+