Pavan147 committed
Commit b85af28 · verified · 1 Parent(s): 8dc569d

Update app.py

Files changed (1)
  1. app.py +74 -98
app.py CHANGED
@@ -1,101 +1,3 @@
- # import gradio as gr
- # from transformers import AutoProcessor, AutoModelForImageTextToText
- # from PIL import Image
- # import re
-
- # # Load SmolDocling model & processor once
- # processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
- # model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
-
- # def extract_fcel_values_from_image(image, prompt_text):
- #     """Run SmolDocling on an image and return numeric values inside <fcel> tags."""
- #     # Prepare prompt for the model
- #     messages = [
- #         {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
- #     ]
- #     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
- #     inputs = processor(text=prompt, images=[image], return_tensors="pt")
-
- #     # Generate output
- #     outputs = model.generate(**inputs, max_new_tokens=2048)
- #     prompt_length = inputs.input_ids.shape[1]
- #     generated = outputs[:, prompt_length:]
- #     result = processor.batch_decode(generated, skip_special_tokens=False)[0]
- #     clean_text = result.replace("<end_of_utterance>", "").strip()
-
- #     # Extract only <fcel> values
- #     values = re.findall(r"<fcel>([\d.]+)", clean_text)
- #     values = [float(v) for v in values]  # convert to floats
-
- #     return values, clean_text
-
- # def compare_images(image1, image2, prompt_text):
- #     # Extract fcel values from both images
- #     values1, raw1 = extract_fcel_values_from_image(image1, prompt_text)
- #     values2, raw2 = extract_fcel_values_from_image(image2, prompt_text)
-
- #     # Calculate accuracy
- #     if len(values1) == len(values2) and values1 == values2:
- #         accuracy = 100.0
- #     else:
- #         matches = sum(1 for a, b in zip(values1, values2) if a == b)
- #         total = max(len(values1), len(values2))
- #         accuracy = (matches / total) * 100 if total > 0 else 0
-
- #     return {
- #         # "Extracted Values 1": values1,
- #         # "Extracted Values 2": values2,
- #         "Accuracy (%)": accuracy
- #     }
-
- # # Gradio UI
- # demo = gr.Interface(
- #     fn=compare_images,
- #     inputs=[
- #         gr.Image(type="pil", label="Upload First Table Image"),
- #         gr.Image(type="pil", label="Upload Second Table Image"),
- #         gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Extract table as OTSL)", label="Prompt")
- #     ],
- #     outputs="json",
- #     title="Table Data Accuracy Checker (SmolDocling)",
- #     description="Uploads two table images, extracts only <fcel> values from OTSL output, and compares them for accuracy."
- # )
-
- # demo.launch()
-
- # import gradio as gr
- # from transformers import AutoProcessor, AutoModelForImageTextToText
- # from PIL import Image
-
- # # Load model & processor once at startup
- # processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
- # model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
-
- # def smoldocling_readimage(image, prompt_text):
- #     messages = [
- #         {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
- #     ]
- #     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
- #     inputs = processor(text=prompt, images=[image], return_tensors="pt")
- #     outputs = model.generate(**inputs, max_new_tokens=1024)
- #     prompt_length = inputs.input_ids.shape[1]
- #     generated = outputs[:, prompt_length:]
- #     result = processor.batch_decode(generated, skip_special_tokens=False)[0]
- #     return result.replace("<end_of_utterance>", "").strip()
-
- # # Gradio UI
- # demo = gr.Interface(
- #     fn=smoldocling_readimage,
- #     inputs=[
- #         gr.Image(type="pil", label="Upload Image"),
- #         gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Convert to docling)", label="Prompt"),
- #     ],
- #     outputs="html",
- #     title="SmolDocling Web App",
- #     description="Upload a document image and convert it to structured docling format."
- # )
-
- # demo.launch()
 
  import re
  import gradio as gr
@@ -162,3 +64,77 @@ demo = gr.Interface(
 )
 
 demo.launch()
+
+
+ import re
+ import gradio as gr
+ from transformers import AutoProcessor, AutoModelForImageTextToText
+ from PIL import Image
+
+ # Load model & processor once at startup
+ processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
+ model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
+
+ def smoldocling_readimage(image, prompt_text="Convert to docling"):
+     messages = [
+         {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
+     ]
+     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+     inputs = processor(text=prompt, images=[image], return_tensors="pt")
+     outputs = model.generate(**inputs, max_new_tokens=1024)
+     prompt_length = inputs.input_ids.shape[1]
+     generated = outputs[:, prompt_length:]
+     result = processor.batch_decode(generated, skip_special_tokens=False)[0]
+     return result.replace("<end_of_utterance>", "").strip()
+
+ def extract_numbers(docling_text):
+     # Extract all floating numbers from the docling text
+     numbers = re.findall(r"[-+]?\d*\.\d+|\d+", docling_text)
+     return list(map(float, numbers))
+
+ def compare_outputs(img1, img2):
+     # Get outputs
+     output1 = smoldocling_readimage(img1)
+     output2 = smoldocling_readimage(img2)
+
+     # Extract numbers
+     nums1 = extract_numbers(output1)
+     nums2 = extract_numbers(output2)
+
+     length = min(len(nums1), len(nums2))
+     matches = 0
+     mismatches = []
+
+     for i in range(length):
+         if abs(nums1[i] - nums2[i]) < 1e-3:
+             matches += 1
+         else:
+             mismatches.append(f"Pos {i+1}: {nums1[i]} ≠ {nums2[i]}")
+
+     total = max(len(nums1), len(nums2))
+     accuracy = (matches / total) * 100 if total > 0 else 0
+
+     mismatch_text = "\n".join(mismatches) if mismatches else "✅ All values match."
+
+     result_text = (
+         f"📄 Output for Image 1:\n{output1}\n\n"
+         f"📄 Output for Image 2:\n{output2}\n\n"
+         f"🔍 Similarity Accuracy: {accuracy:.2f}%\n"
+         f"✅ Matching Values: {matches} / {total}\n"
+         f"❌ Mismatches:\n{mismatch_text}"
+     )
+     return result_text
+
+ # Gradio UI
+ demo = gr.Interface(
+     fn=compare_outputs,
+     inputs=[
+         gr.Image(type="pil", label="Upload Image 1"),
+         gr.Image(type="pil", label="Upload Image 2"),
+     ],
+     outputs="text",
+     title="SmolDocling Image Comparison",
+     description="Upload two document images to extract values and compare similarity, with detailed mismatches."
+ )
+
+ demo.launch()
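
For a quick local sanity check of the matching logic added in this commit, here is a minimal sketch that reuses the extract_numbers regex and the tolerance/accuracy computation from the diff above; the two sample strings are made-up stand-ins for SmolDocling output, so no model download is needed:

import re

def extract_numbers(docling_text):
    # Same pattern as in the updated app.py: floats first, then bare integers
    numbers = re.findall(r"[-+]?\d*\.\d+|\d+", docling_text)
    return list(map(float, numbers))

# Hypothetical strings standing in for smoldocling_readimage() results
out1 = "<fcel>12.5<fcel>3<fcel>7.25"
out2 = "<fcel>12.5<fcel>4<fcel>7.25"

nums1, nums2 = extract_numbers(out1), extract_numbers(out2)
matches = sum(1 for a, b in zip(nums1, nums2) if abs(a - b) < 1e-3)
total = max(len(nums1), len(nums2))
accuracy = (matches / total) * 100 if total > 0 else 0
print(f"accuracy={accuracy:.2f}%  matches={matches}/{total}")  # accuracy=66.67%  matches=2/3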