Pavan147 committed on
Commit
fcd0714
·
verified ·
1 Parent(s): a62604d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -40
app.py CHANGED
@@ -1,64 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  from transformers import AutoProcessor, AutoModelForImageTextToText
3
  from PIL import Image
4
- import re
5
 
6
# Instantiate the SmolDocling processor and model a single time at import,
# both from the same checkpoint identifier.
_CHECKPOINT = "ds4sd/SmolDocling-256M-preview"
processor = AutoProcessor.from_pretrained(_CHECKPOINT)
model = AutoModelForImageTextToText.from_pretrained(_CHECKPOINT)
9
 
10
def extract_fcel_values_from_image(image, prompt_text):
    """Run SmolDocling on an image and return the numeric ``<fcel>`` values.

    Args:
        image: Image handed to ``processor`` together with the prompt.
        prompt_text: Instruction placed after the image in the chat message.

    Returns:
        A ``(values, clean_text)`` tuple: the floats found directly after
        ``<fcel>`` tags, and the decoded model output with
        ``<end_of_utterance>`` removed and surrounding whitespace stripped.
    """
    messages = [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")

    outputs = model.generate(**inputs, max_new_tokens=2048)
    # Drop the first prompt_length positions so only the continuation is decoded.
    prompt_length = inputs.input_ids.shape[1]
    generated = outputs[:, prompt_length:]
    # NOTE(review): special tokens are kept, presumably so the <fcel> tags
    # survive decoding -- confirm against the tokenizer.
    result = processor.batch_decode(generated, skip_special_tokens=False)[0]
    clean_text = result.replace("<end_of_utterance>", "").strip()

    # Capture one well-formed decimal per tag. The earlier pattern [\d.]+
    # also captured tokens such as "." or "1.2.3", on which float() raises
    # ValueError and aborts the whole extraction.
    values = [
        float(token)
        for token in re.findall(r"<fcel>(\d+\.?\d*|\.\d+)", clean_text)
    ]
    return values, clean_text
31
-
32
def compare_images(image1, image2, prompt_text):
    """Extract ``<fcel>`` values from two images and score their agreement.

    Both images are processed with the same prompt. Identical value lists
    score 100.0; otherwise the score is the share of position-wise equal
    values relative to the longer list, expressed as a percentage.

    Returns:
        A dict with the two extracted value lists and the accuracy figure.
    """
    first_values, _ = extract_fcel_values_from_image(image1, prompt_text)
    second_values, _ = extract_fcel_values_from_image(image2, prompt_text)

    if first_values == second_values:
        score = 100.0
    else:
        agreeing = sum(
            1 for left, right in zip(first_values, second_values) if left == right
        )
        longest = max(len(first_values), len(second_values))
        score = (agreeing / longest) * 100 if longest > 0 else 0

    report = {}
    report["Extracted Values 1"] = first_values
    report["Extracted Values 2"] = second_values
    report["Accuracy (%)"] = score
    return report
50
 
51
  # Gradio UI
52
  demo = gr.Interface(
53
- fn=compare_images,
54
  inputs=[
55
- gr.Image(type="pil", label="Upload First Table Image"),
56
- gr.Image(type="pil", label="Upload Second Table Image"),
57
- gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Extract table as OTSL)", label="Prompt")
58
  ],
59
- outputs="json",
60
- title="Table Data Accuracy Checker (SmolDocling)",
61
- description="Uploads two table images, extracts only <fcel> values from OTSL output, and compares them for accuracy."
62
  )
63
 
64
  demo.launch()
 
 
1
+ # import gradio as gr
2
+ # from transformers import AutoProcessor, AutoModelForImageTextToText
3
+ # from PIL import Image
4
+ # import re
5
+
6
+ # # Load SmolDocling model & processor once
7
+ # processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
8
+ # model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
9
+
10
+ # def extract_fcel_values_from_image(image, prompt_text):
11
+ # """Run SmolDocling on an image and return numeric values inside <fcel> tags."""
12
+ # # Prepare prompt for the model
13
+ # messages = [
14
+ # {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
15
+ # ]
16
+ # prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
17
+ # inputs = processor(text=prompt, images=[image], return_tensors="pt")
18
+
19
+ # # Generate output
20
+ # outputs = model.generate(**inputs, max_new_tokens=2048)
21
+ # prompt_length = inputs.input_ids.shape[1]
22
+ # generated = outputs[:, prompt_length:]
23
+ # result = processor.batch_decode(generated, skip_special_tokens=False)[0]
24
+ # clean_text = result.replace("<end_of_utterance>", "").strip()
25
+
26
+ # # Extract only <fcel> values
27
+ # values = re.findall(r"<fcel>([\d.]+)", clean_text)
28
+ # values = [float(v) for v in values] # convert to floats
29
+
30
+ # return values, clean_text
31
+
32
+ # def compare_images(image1, image2, prompt_text):
33
+ # # Extract fcel values from both images
34
+ # values1, raw1 = extract_fcel_values_from_image(image1, prompt_text)
35
+ # values2, raw2 = extract_fcel_values_from_image(image2, prompt_text)
36
+
37
+ # # Calculate accuracy
38
+ # if len(values1) == len(values2) and values1 == values2:
39
+ # accuracy = 100.0
40
+ # else:
41
+ # matches = sum(1 for a, b in zip(values1, values2) if a == b)
42
+ # total = max(len(values1), len(values2))
43
+ # accuracy = (matches / total) * 100 if total > 0 else 0
44
+
45
+ # return {
46
+ # "Extracted Values 1": values1,
47
+ # "Extracted Values 2": values2,
48
+ # "Accuracy (%)": accuracy
49
+ # }
50
+
51
+ # # Gradio UI
52
+ # demo = gr.Interface(
53
+ # fn=compare_images,
54
+ # inputs=[
55
+ # gr.Image(type="pil", label="Upload First Table Image"),
56
+ # gr.Image(type="pil", label="Upload Second Table Image"),
57
+ # gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Extract table as OTSL)", label="Prompt")
58
+ # ],
59
+ # outputs="json",
60
+ # title="Table Data Accuracy Checker (SmolDocling)",
61
+ # description="Uploads two table images, extracts only <fcel> values from OTSL output, and compares them for accuracy."
62
+ # )
63
+
64
+ # demo.launch()
65
+
66
  import gradio as gr
67
  from transformers import AutoProcessor, AutoModelForImageTextToText
68
  from PIL import Image
69
+ import json
70
 
71
# Build the processor and the model once when the module is imported,
# both from the same checkpoint identifier.
_CHECKPOINT = "ds4sd/SmolDocling-256M-preview"
processor = AutoProcessor.from_pretrained(_CHECKPOINT)
model = AutoModelForImageTextToText.from_pretrained(_CHECKPOINT)
74
 
75
def smoldocling_readimage(image, prompt_text):
    """Run SmolDocling on one image and return its output parsed as JSON.

    Args:
        image: Image handed to ``processor``; ``None`` is accepted and
            reported as an error instead of being sent to the model.
        prompt_text: Instruction placed after the image in the chat message.

    Returns:
        The value produced by ``json.loads`` when the decoded output is
        valid JSON. Otherwise a dict with an ``"error"`` message; it also
        carries ``"raw_output"`` (the decoded text) when the model ran but
        its output could not be parsed.
    """
    # Gradio hands over None when no image was uploaded. Report that in the
    # same error-dict shape used below rather than failing in the processor.
    if image is None:
        return {"error": "No image provided"}

    messages = [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=1024)

    # Drop the first prompt_length positions so only the continuation is decoded.
    prompt_length = inputs.input_ids.shape[1]
    generated = outputs[:, prompt_length:]
    result = processor.batch_decode(generated, skip_special_tokens=False)[0]
    clean_result = result.replace("<end_of_utterance>", "").strip()

    # Only the parse itself sits in the try body, so unrelated errors are
    # never mistaken for malformed JSON.
    try:
        json_result = json.loads(clean_result)
    except json.JSONDecodeError:
        return {"error": "Output is not valid JSON", "raw_output": clean_result}
    return json_result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  # Gradio UI
95
  demo = gr.Interface(
96
+ fn=smoldocling_readimage,
97
  inputs=[
98
+ gr.Image(type="pil", label="Upload Image"),
99
+ gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Convert to docling)", label="Prompt"),
 
100
  ],
101
+ outputs=gr.JSON(),
102
+ title="SmolDocling Web App",
103
+ description="Upload a document image and convert it to structured docling format."
104
  )
105
 
106
  demo.launch()
107
+